From c6b9cc043406d5393478ba8cef751dc5504fbad7 Mon Sep 17 00:00:00 2001
From: Konstantin Isakov <ikm@users.berlios.de>
Date: Tue, 30 Mar 2010 17:41:14 +0400
Subject: [PATCH] When an unsuccessful attempt to look up a phrase or whole
 sentence is made, the query is split into individual words and presented
 to the user as links. Then lookups are attempted for all the compound
 expressions (expressions consisting of two or more words), and the results
 are also presented to the user.

---
 src/article_maker.cc | 278 ++++++++++++++++++++++++++++++++++++++++---
 src/article_maker.hh |  35 +++++-
 src/folding.cc       |  23 ++++
 src/folding.hh       |   3 +
 src/wordfinder.cc    |  10 +-
 src/wordfinder.hh    |   9 +-
 6 files changed, 336 insertions(+), 22 deletions(-)
diff --git a/src/article_maker.cc b/src/article_maker.cc
index 673a6142..1077eb87 100644
--- a/src/article_maker.cc
+++ b/src/article_maker.cc
@@ -9,6 +9,7 @@
 #include <limits.h>
 #include <QFile>
 #include <QUrl>
+#include "folding.hh"
 
 using std::vector;
 using std::string;
@@ -103,13 +104,21 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word,
 std::string ArticleMaker::makeNotFoundBody( QString const & word,
                                             QString const & group )
 {
+  string result( "<div class=\"gdnotfound\"><p>" );
 
-  return string( "<div class=\"gdnotfound\"><p>" ) +
-      tr( "No translation for <b>%1</b> was found in group <b>%2</b>." ).
-        arg( QString::fromUtf8( Html::escape( word.toUtf8().data() ).c_str() ) ).
-        arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
-          toUtf8().data()
-        +"</p></div>";
+  if ( word.size() )
+    result += tr( "No translation for <b>%1</b> was found in group <b>%2</b>." ).
+              arg( QString::fromUtf8( Html::escape( word.toUtf8().data() ).c_str() ) ).
+              arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
+                toUtf8().data();
+  else
+    result += tr( "No translation was found in group <b>%1</b>." ).
+              arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
+                toUtf8().data();
+
+  result += "</p></div>";
+
+  return result;
 }
 
 sptr< Dictionary::DataRequest > ArticleMaker::makeDefinitionFor(
@@ -441,7 +450,10 @@ void ArticleRequest::bodyFinished()
       if ( !foundAnyDefinitions )
       {
         // No definitions were ever found, say so to the user.
-        footer += ArticleMaker::makeNotFoundBody( word, group );
+
+        // Larger words are usually whole sentences - don't clutter the output
+        // with their full bodies.
+        footer += ArticleMaker::makeNotFoundBody( word.size() < 40 ? word : "", group );
 
         // When there were no definitions, we run stemmed search.
         stemmedWordFinder = new WordFinder( this );
@@ -483,6 +495,8 @@ void ArticleRequest::stemmedSearchFinished()
 
   string footer;
 
+  bool continueMatching = false;
+
   if ( sr.size() )
   {
     footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
@@ -491,14 +505,7 @@ void ArticleRequest::stemmedSearchFinished()
 
     for( unsigned x = 0; x < sr.size(); ++x )
     {
-      QUrl url;
-
-      url.setScheme( "gdlookup" );
-      url.setHost( "localhost" );
-      url.setPath( sr[ x ].first );
-
-      string escapedResult = Html::escape( sr[ x ].first.toUtf8().data() );
-      footer += string( "<a href=\"" ) + url.toEncoded().data() + "\">" + escapedResult +"</a>";
+      footer += linkWord( sr[ x ].first );
 
       if ( x != sr.size() - 1 )
       {
@@ -509,7 +516,42 @@ void ArticleRequest::stemmedSearchFinished()
     footer += "</span></div>";
   }
 
-  footer += "</body></html>";
+  splittedWords = splitIntoWords( word );
+
+  if ( splittedWords.first.size() > 1 ) // Contains more than one word
+  {
+    footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
+      Html::escape( tr( "Individual words: " ).toUtf8().data() ) +
+      "</span><span class=\"gdstemmedsuggestion_body\">";
+
+    footer += escapeSpacing( splittedWords.second[ 0 ] );
+
+    for( int x = 0; x < splittedWords.first.size(); ++x )
+    {
+      footer += linkWord( splittedWords.first[ x ] );
+      footer += escapeSpacing( splittedWords.second[ x + 1 ] );
+    }
+
+    footer += "</span>";
+
+    disconnect( stemmedWordFinder.get(), SIGNAL( finished() ),
+                this, SLOT( stemmedSearchFinished() ) );
+
+    connect( stemmedWordFinder.get(), SIGNAL( finished() ),
+             this, SLOT( individualWordFinished() ), Qt::QueuedConnection );
+
+    currentSplittedWordStart = -1;
+    currentSplittedWordEnd = currentSplittedWordStart;
+
+    firstCompoundWasFound = false;
+
+    compoundSearchNextStep( false );
+
+    continueMatching = true;
+  }
+
+  if ( !continueMatching )
+    footer += "</body></html>";
 
   {
     Mutex::Lock _( dataMutex );
@@ -521,6 +563,208 @@ void ArticleRequest::stemmedSearchFinished()
     memcpy( &data.front() + offset, footer.data(), footer.size() );
   }
 
-  finish();
+  if ( continueMatching )
+    update();
+  else
+    finish();
 }
 
+void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
+{
+  if ( !lastSearchSucceeded )
+  {
+    // Last search was unsuccessful. First, emit what we had.
+
+    string footer;
+
+    if ( currentSplittedWordEnd - currentSplittedWordStart > 1 ) // We have something to append
+    {
+//      printf( "Appending\n" );
+
+      --currentSplittedWordEnd;
+
+      if ( !firstCompoundWasFound )
+      {
+        // Append the beginning
+        footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
+          Html::escape( tr( "Compound expressions: " ).toUtf8().data() ) +
+          "</span><span class=\"gdstemmedsuggestion_body\">";
+
+        firstCompoundWasFound = true;
+      }
+      else
+      {
+        // Append the separator
+        footer += " / ";
+      }
+
+      footer += linkWord( makeSplittedWordCompound() );
+    }
+
+    // Then, start a new search for the next word, if possible
+
+    if ( currentSplittedWordStart >= splittedWords.first.size() - 2 )
+    {
+      // The last word was the last possible to start from
+
+      if ( firstCompoundWasFound )
+        footer += "</span>";
+
+      footer += "</body></html>";
+
+      appendToData( footer );
+
+      finish();
+
+      return;
+    }
+
+    if ( footer.size() )
+    {
+      appendToData( footer );
+      update();
+    }
+
+    // Advance to the next word and start from looking up two words
+    ++currentSplittedWordStart;
+    currentSplittedWordEnd = currentSplittedWordStart + 1;
+  }
+  else
+  {
+    // Last lookup succeeded -- see if we can try the larger sequence
+
+    if ( currentSplittedWordEnd < splittedWords.first.size() - 1 )
+    {
+      // We can, indeed.
+      ++currentSplittedWordEnd;
+    }
+    else
+    {
+      // We can't. Emit what we have and start over.
+
+      ++currentSplittedWordEnd; // So we could use the same code for result
+                                // emitting
+
+      // Initiate new lookup
+      compoundSearchNextStep( false );
+
+      return;
+    }
+  }
+
+  // Build the compound sequence
+
+  currentSplittedWordCompound = makeSplittedWordCompound();
+
+  // Look it up
+
+//  printf( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) );
+
+  stemmedWordFinder->stemmedMatch( currentSplittedWordCompound, activeDicts, 0, 0, 1 );
+}
+
+QString ArticleRequest::makeSplittedWordCompound()
+{
+  QString result;
+
+  result.clear();
+
+  for( int x = currentSplittedWordStart; x <= currentSplittedWordEnd; ++x )
+  {
+    result.append( splittedWords.first[ x ] );
+
+    if ( x < currentSplittedWordEnd )
+    {
+      wstring ws( gd::toWString( splittedWords.second[ x + 1 ] ) );
+
+      Folding::normalizeWhitespace( ws );
+
+      result.append( gd::toQString( ws ) );
+    }
+  }
+
+  return result;
+}
+
+void ArticleRequest::individualWordFinished()
+{
+  WordFinder::SearchResults const & results = stemmedWordFinder->getResults();
+
+  if ( results.size() )
+  {
+    // Check if the aliases are acceptable
+    wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) );
+
+    for( unsigned x = 0; x < results.size(); ++x )
+      if ( source == Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) )
+      {
+        // Ok, good enough
+        compoundSearchNextStep( true );
+        return;
+      }
+  }
+
+  compoundSearchNextStep( false );
+}
+
+void ArticleRequest::appendToData( std::string const & str )
+{
+  Mutex::Lock _( dataMutex );
+
+  size_t offset = data.size();
+
+  data.resize( data.size() + str.size() );
+
+  memcpy( &data.front() + offset, str.data(), str.size() );
+
+}
+
+QPair< ArticleRequest::Words, ArticleRequest::Spacings > ArticleRequest::splitIntoWords( QString const & input )
+{
+  QPair< Words, Spacings > result;
+
+  QChar const * ptr = input.data();
+
+  for( ; ; )
+  {
+    QString spacing;
+
+    for( ; ptr->unicode() && ( Folding::isPunct( ptr->unicode() ) || Folding::isWhitespace( ptr->unicode() ) ); ++ptr )
+      spacing.append( *ptr );
+
+    result.second.append( spacing );
+
+    QString word;
+
+    for( ; ptr->unicode() && !( Folding::isPunct( ptr->unicode() ) || Folding::isWhitespace( ptr->unicode() ) ); ++ptr )
+      word.append( *ptr );
+
+    if ( word.isEmpty() )
+      break;
+
+    result.first.append( word );
+  }
+
+  return result;
+}
+
+string ArticleRequest::linkWord( QString const & str )
+{
+  QUrl url;
+
+  url.setScheme( "gdlookup" );
+  url.setHost( "localhost" );
+  url.setPath( str );
+
+  string escapedResult = Html::escape( str.toUtf8().data() );
+  return string( "<a href=\"" ) + url.toEncoded().data() + "\">" + escapedResult +"</a>";
+}
+
+std::string ArticleRequest::escapeSpacing( QString const & str )
+{
+  QByteArray spacing = Html::escape( str.toUtf8().data() ).c_str();
+
+  spacing.replace( "\n", "<br>" );
+
+  return spacing.data();
+}
diff --git a/src/article_maker.hh b/src/article_maker.hh
index f33e1e68..761f5de3 100644
--- a/src/article_maker.hh
+++ b/src/article_maker.hh
@@ -27,7 +27,7 @@ public:
   /// On construction, a reference to all dictionaries and a reference all
   /// groups' instances are to be passed. Those references are kept stored as
   /// references, and as such, any changes to them would reflect on the results
-  /// of the inquiries, altthough those changes are perfectly legal.
+  /// of the inquiries, although those changes are perfectly legal.
   ArticleMaker( std::vector< sptr< Dictionary::Class > > const & dictionaries,
                 std::vector< Instances::Group > const & groups,
                 QString const & displayStyle );
@@ -88,6 +88,20 @@ class ArticleRequest: public Dictionary::DataRequest
                       // be closed after the article ends.
   sptr< WordFinder > stemmedWordFinder; // Used when there're no results
 
+  /// A sequence of words and spacings between them, including the initial
+  /// spacing before the first word and the final spacing after the last word.
+  typedef QList< QString > Words;
+  typedef QList< QString > Spacings;
+
+  /// Splits the given string into words and spacings between them.
+  QPair< Words, Spacings > splitIntoWords( QString const & );
+
+  QPair< Words, Spacings > splittedWords;
+  int currentSplittedWordStart;
+  int currentSplittedWordEnd;
+  QString currentSplittedWordCompound;
+  bool firstCompoundWasFound;
+
 public:
 
   ArticleRequest( QString const & word, QString const & group,
@@ -103,6 +117,25 @@ private slots:
   void altSearchFinished();
   void bodyFinished();
   void stemmedSearchFinished();
+  void individualWordFinished();
+
+private:
+
+  /// Appends the given string to 'data' while holding dataMutex.
+  void appendToData( std::string const & );
+
+  /// Uses stemmedWordFinder to perform the next step of looking up word
+  /// combinations.
+  void compoundSearchNextStep( bool lastSearchSucceeded );
+
+  /// Creates a single word out of the [currentSplittedWordStart..End] range.
+  QString makeSplittedWordCompound();
+
+  /// Makes an html link to the given word.
+  std::string linkWord( QString const & );
+
+  /// Escapes the spacing between the words to include in html.
+  std::string escapeSpacing( QString const & );
 };
 
 
diff --git a/src/folding.cc b/src/folding.cc
index b88d8441..a16c268e 100644
--- a/src/folding.cc
+++ b/src/folding.cc
@@ -636,4 +636,27 @@ wstring trimWhitespace( wstring const & in )
   return wstring( wordBegin, wordSize );
 }
 
+void normalizeWhitespace( wstring & str )
+{
+  for( size_t x = str.size(); x-- > 1; ) // >1 -- Don't test the first char
+  {
+    if ( isWhitespace( str[ x ] ) )
+    {
+      size_t y;
+      for( y = x; y && ( isWhitespace( str[ y - 1 ] ) ) ; --y );
+
+      if ( y != x )
+      {
+        // Remove extra spaces
+
+        str.erase( y, x - y );
+
+        x = y;
+
+        str[ x ] = ' ';
+      }
+    }
+  }
+}
+
 }
diff --git a/src/folding.hh b/src/folding.hh
index 6e062aac..cfb2cd85 100644
--- a/src/folding.hh
+++ b/src/folding.hh
@@ -67,6 +67,9 @@ wstring trimWhitespaceOrPunct( wstring const & );
 /// the word.
 wstring trimWhitespace( wstring const & );
 
+/// Collapses each run of two or more whitespace characters into one basic space.
+void normalizeWhitespace( wstring & );
+
 /// Same as apply( wstring ), but without any heap operations, therefore
 /// preferable when there're many strings to process. Returns -1 if the
 /// operation succeded, or otherwise the minimum value of outSize required
diff --git a/src/wordfinder.cc b/src/wordfinder.cc
index 006f090a..0be3f4e2 100644
--- a/src/wordfinder.cc
+++ b/src/wordfinder.cc
@@ -56,7 +56,10 @@ void WordFinder::prefixMatch( QString const & str,
   // cancelled, but still it could take some time.
 }
 void WordFinder::stemmedMatch( QString const & str,
-                               std::vector< sptr< Dictionary::Class > > const & dicts )
+                               std::vector< sptr< Dictionary::Class > > const & dicts,
+                               unsigned minLength,
+                               unsigned maxSuffixVariation,
+                               unsigned long maxResults )
 {
   cancel();
 
@@ -64,6 +67,9 @@ void WordFinder::stemmedMatch( QString const & str,
   searchType = StemmedMatch;
   inputWord = str;
   inputDicts = &dicts;
+  stemmedMinLength = minLength;
+  stemmedMaxSuffixVariation = maxSuffixVariation;
+  stemmedMaxResults = maxResults;
 
   resultsArray.clear();
   resultsIndex.clear();
@@ -110,7 +116,7 @@ void WordFinder::startSearch()
       sptr< Dictionary::WordSearchRequest > sr =
         ( searchType == PrefixMatch ) ?
           (*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], 40 ) :
-          (*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], 3, 3, 30 );
+          (*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, stemmedMaxResults );
   
       connect( sr.get(), SIGNAL( finished() ),
                this, SLOT( requestFinished() ), Qt::QueuedConnection );
diff --git a/src/wordfinder.hh b/src/wordfinder.hh
index fe7aad1f..423e1dd1 100644
--- a/src/wordfinder.hh
+++ b/src/wordfinder.hh
@@ -42,6 +42,9 @@ private:
     PrefixMatch,
     StemmedMatch
   } searchType;
+  unsigned stemmedMinLength;
+  unsigned stemmedMaxSuffixVariation;
+  unsigned long stemmedMaxResults;
 
   std::vector< sptr< Dictionary::Class > > const * inputDicts;
 
@@ -76,11 +79,13 @@ public:
   void prefixMatch( QString const &,
                     std::vector< sptr< Dictionary::Class > > const & );
 
-
   /// Do a stemmed-match search in the given list of dictionaries. All comments
   /// from prefixMatch() generally apply as well.
   void stemmedMatch( QString const &,
-                     std::vector< sptr< Dictionary::Class > > const & );
+                     std::vector< sptr< Dictionary::Class > > const &,
+                     unsigned minLength = 3,
+                     unsigned maxSuffixVariation = 3,
+                     unsigned long maxResults = 30 );
   
   /// Returns the vector containing search results from the last operation.
   /// If it didn't finish yet, the result is not final and may be changing