When an unsuccessful attempt to look up a phrase or whole sentence is made, the query is split into individual words, which are presented to the user as links. Then all compound expressions (expressions consisting of two or more words) are looked up, and the results are also presented to the user.

This commit is contained in:
Konstantin Isakov 2010-03-30 17:41:14 +04:00
parent 240bff7e63
commit c6b9cc0434
6 changed files with 336 additions and 22 deletions

View file

@ -9,6 +9,7 @@
#include <limits.h>
#include <QFile>
#include <QUrl>
#include "folding.hh"
using std::vector;
using std::string;
@ -103,13 +104,21 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word,
std::string ArticleMaker::makeNotFoundBody( QString const & word,
QString const & group )
{
string result( "<div class=\"gdnotfound\"><p>" );
return string( "<div class=\"gdnotfound\"><p>" ) +
tr( "No translation for <b>%1</b> was found in group <b>%2</b>." ).
arg( QString::fromUtf8( Html::escape( word.toUtf8().data() ).c_str() ) ).
arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
toUtf8().data()
+"</p></div>";
if ( word.size() )
result += tr( "No translation for <b>%1</b> was found in group <b>%2</b>." ).
arg( QString::fromUtf8( Html::escape( word.toUtf8().data() ).c_str() ) ).
arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
toUtf8().data();
else
result += tr( "No translation was found in group <b>%1</b>." ).
arg( QString::fromUtf8( Html::escape( group.toUtf8().data() ).c_str() ) ).
toUtf8().data();
result += "</p></div>";
return result;
}
sptr< Dictionary::DataRequest > ArticleMaker::makeDefinitionFor(
@ -441,7 +450,10 @@ void ArticleRequest::bodyFinished()
if ( !foundAnyDefinitions )
{
// No definitions were ever found, say so to the user.
footer += ArticleMaker::makeNotFoundBody( word, group );
// Larger words are usually whole sentences - don't clutter the output
// with their full bodies.
footer += ArticleMaker::makeNotFoundBody( word.size() < 40 ? word : "", group );
// When there were no definitions, we run stemmed search.
stemmedWordFinder = new WordFinder( this );
@ -483,6 +495,8 @@ void ArticleRequest::stemmedSearchFinished()
string footer;
bool continueMatching = false;
if ( sr.size() )
{
footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
@ -491,14 +505,7 @@ void ArticleRequest::stemmedSearchFinished()
for( unsigned x = 0; x < sr.size(); ++x )
{
QUrl url;
url.setScheme( "gdlookup" );
url.setHost( "localhost" );
url.setPath( sr[ x ].first );
string escapedResult = Html::escape( sr[ x ].first.toUtf8().data() );
footer += string( "<a href=\"" ) + url.toEncoded().data() + "\">" + escapedResult +"</a>";
footer += linkWord( sr[ x ].first );
if ( x != sr.size() - 1 )
{
@ -509,7 +516,42 @@ void ArticleRequest::stemmedSearchFinished()
footer += "</span></div>";
}
footer += "</body></html>";
splittedWords = splitIntoWords( word );
if ( splittedWords.first.size() > 1 ) // Contains more than one word
{
footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
Html::escape( tr( "Individual words: " ).toUtf8().data() ) +
"</span><span class=\"gdstemmedsuggestion_body\">";
footer += escapeSpacing( splittedWords.second[ 0 ] );
for( int x = 0; x < splittedWords.first.size(); ++x )
{
footer += linkWord( splittedWords.first[ x ] );
footer += escapeSpacing( splittedWords.second[ x + 1 ] );
}
footer += "</span>";
disconnect( stemmedWordFinder.get(), SIGNAL( finished() ),
this, SLOT( stemmedSearchFinished() ) );
connect( stemmedWordFinder.get(), SIGNAL( finished() ),
this, SLOT( individualWordFinished() ), Qt::QueuedConnection );
currentSplittedWordStart = -1;
currentSplittedWordEnd = currentSplittedWordStart;
firstCompoundWasFound = false;
compoundSearchNextStep( false );
continueMatching = true;
}
if ( !continueMatching )
footer += "</body></html>";
{
Mutex::Lock _( dataMutex );
@ -521,6 +563,208 @@ void ArticleRequest::stemmedSearchFinished()
memcpy( &data.front() + offset, footer.data(), footer.size() );
}
finish();
if ( continueMatching )
update();
else
finish();
}
/// Performs the next step of the compound-expression search over
/// splittedWords, using stemmedWordFinder for the actual lookups.
///
/// The candidate compound is the word range
/// [currentSplittedWordStart..currentSplittedWordEnd]. Each call either starts
/// one more lookup through stemmedWordFinder, or appends the closing markup to
/// 'data' and calls finish().
///
/// lastSearchSucceeded tells whether the lookup of the current range matched:
///  - true:  try to grow the range by one more word; if it already ends at
///           the last word, emit it and move on as if a lookup had failed.
///  - false: emit the longest range that did match (if any), then restart
///           from the next starting word with a two-word range.
void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
{
  if ( !lastSearchSucceeded )
  {
    // Last search was unsuccessful. First, emit what we had.
    string footer;

    if ( currentSplittedWordEnd - currentSplittedWordStart > 1 ) // We have something to append
    {
      // printf( "Appending\n" );

      // The range was grown by one word before the step that brought us
      // here; step back to the range that actually matched.
      --currentSplittedWordEnd;

      if ( !firstCompoundWasFound )
      {
        // Append the beginning
        footer += "<div class=\"gdstemmedsuggestion\"><span class=\"gdstemmedsuggestion_head\">" +
          Html::escape( tr( "Compound expressions: " ).toUtf8().data() ) +
          "</span><span class=\"gdstemmedsuggestion_body\">";

        firstCompoundWasFound = true;
      }
      else
      {
        // Append the separator
        footer += " / ";
      }

      footer += linkWord( makeSplittedWordCompound() );
    }

    // Then, start a new search for the next word, if possible
    if ( currentSplittedWordStart >= splittedWords.first.size() - 2 )
    {
      // The last word was the last possible to start from

      // Close both the body span and the enclosing div opened above, so the
      // emitted markup is balanced.
      if ( firstCompoundWasFound )
        footer += "</span></div>";

      footer += "</body></html>";

      appendToData( footer );

      finish();

      return;
    }

    if ( footer.size() )
    {
      // Publish the newly found compound right away; more may follow.
      appendToData( footer );
      update();
    }

    // Advance to the next word and start from looking up two words
    ++currentSplittedWordStart;
    currentSplittedWordEnd = currentSplittedWordStart + 1;
  }
  else
  {
    // Last lookup succeeded -- see if we can try the larger sequence
    if ( currentSplittedWordEnd < splittedWords.first.size() - 1 )
    {
      // We can, indeed.
      ++currentSplittedWordEnd;
    }
    else
    {
      // We can't. Emit what we have and start over.
      ++currentSplittedWordEnd; // So we could use the same code for result
                                // emitting

      // Initiate new lookup
      compoundSearchNextStep( false );

      return;
    }
  }

  // Build the compound sequence
  currentSplittedWordCompound = makeSplittedWordCompound();

  // Look it up
  // printf( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) );

  // The trailing arguments are minLength, maxSuffixVariation and maxResults.
  stemmedWordFinder->stemmedMatch( currentSplittedWordCompound, activeDicts, 0, 0, 1 );
}
/// Joins the words of splittedWords in the range
/// [currentSplittedWordStart..currentSplittedWordEnd] into a single string.
/// The spacing found between two neighbouring words is kept, after being
/// passed through Folding::normalizeWhitespace(). Spacing before the first
/// and after the last word of the range is not included.
QString ArticleRequest::makeSplittedWordCompound()
{
  Words const & words = splittedWords.first;
  Spacings const & gaps = splittedWords.second;

  QString compound;

  for ( int idx = currentSplittedWordStart; idx <= currentSplittedWordEnd; ++idx )
  {
    if ( idx > currentSplittedWordStart )
    {
      // gaps[ idx ] is the spacing between word idx - 1 and word idx
      wstring gap( gd::toWString( gaps[ idx ] ) );

      Folding::normalizeWhitespace( gap );

      compound += gd::toQString( gap );
    }

    compound += words[ idx ];
  }

  return compound;
}
void ArticleRequest::individualWordFinished()
{
WordFinder::SearchResults const & results = stemmedWordFinder->getResults();
if ( results.size() )
{
// Check if the aliases are acceptable
wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) );
for( unsigned x = 0; x < results.size(); ++x )
if ( source == Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) )
{
// Ok, good enough
compoundSearchNextStep( true );
return;
}
}
compoundSearchNextStep( false );
}
/// Appends the bytes of the given string to 'data', holding dataMutex for the
/// duration of the modification. An empty string is a no-op.
void ArticleRequest::appendToData( std::string const & str )
{
  // Nothing to copy. Returning early also avoids taking the address of
  // data.front() below when 'data' itself is still empty.
  if ( str.empty() )
    return;

  Mutex::Lock _( dataMutex );

  size_t const offset = data.size();

  data.resize( offset + str.size() );

  memcpy( &data.front() + offset, str.data(), str.size() );
}
/// Splits the input into words and the spacings surrounding them.
/// A spacing is a (possibly empty) run of characters for which
/// Folding::isPunct() or Folding::isWhitespace() holds; a word is a run of
/// any other characters. Scanning stops at the first null character.
///
/// In the result, 'first' holds the words and 'second' the spacings. A spacing
/// is recorded before every word and once more after the last one, so
/// 'second' always has exactly one more entry than 'first'.
QPair< ArticleRequest::Words, ArticleRequest::Spacings > ArticleRequest::splitIntoWords( QString const & input )
{
  QPair< Words, Spacings > parts;

  QChar const * cursor = input.data();

  while ( true )
  {
    // Collect the separators preceding the next word (may be none)
    QString gap;

    while ( cursor->unicode() &&
            ( Folding::isPunct( cursor->unicode() ) ||
              Folding::isWhitespace( cursor->unicode() ) ) )
      gap.append( *cursor++ );

    parts.second.append( gap );

    // Collect the word itself
    QString token;

    while ( cursor->unicode() &&
            !Folding::isPunct( cursor->unicode() ) &&
            !Folding::isWhitespace( cursor->unicode() ) )
      token.append( *cursor++ );

    // No word after the spacing means the input is exhausted
    if ( token.isEmpty() )
      break;

    parts.first.append( token );
  }

  return parts;
}
/// Makes an html link to the given word: an <a> element whose href is a
/// gdlookup://localhost url with the word as its path, and whose text is the
/// html-escaped word.
string ArticleRequest::linkWord( QString const & str )
{
  QUrl target;

  target.setScheme( "gdlookup" );
  target.setHost( "localhost" );
  target.setPath( str );

  string anchor( "<a href=\"" );

  anchor += target.toEncoded().data();
  anchor += "\">";
  anchor += Html::escape( str.toUtf8().data() );
  anchor += "</a>";

  return anchor;
}
/// Escapes the spacing between the words for inclusion in html: the text is
/// html-escaped and every newline is then replaced with a <br> tag.
std::string ArticleRequest::escapeSpacing( QString const & str )
{
  std::string const escaped = Html::escape( str.toUtf8().data() );

  QByteArray html( escaped.c_str() );

  html.replace( "\n", "<br>" );

  return std::string( html.data() );
}

View file

@ -27,7 +27,7 @@ public:
/// On construction, a reference to all dictionaries and a reference to all
/// groups' instances are to be passed. Those references are kept stored as
/// references, and as such, any changes to them would reflect on the results
/// of the inquiries, altthough those changes are perfectly legal.
/// of the inquiries, although those changes are perfectly legal.
ArticleMaker( std::vector< sptr< Dictionary::Class > > const & dictionaries,
std::vector< Instances::Group > const & groups,
QString const & displayStyle );
@ -88,6 +88,20 @@ class ArticleRequest: public Dictionary::DataRequest
// be closed after the article ends.
sptr< WordFinder > stemmedWordFinder; // Used when there're no results
/// A sequence of words and spacings between them, including the initial
/// spacing before the first word and the final spacing after the last word.
typedef QList< QString > Words;
typedef QList< QString > Spacings;
/// Splits the given string into words and spacings between them.
QPair< Words, Spacings > splitIntoWords( QString const & );
QPair< Words, Spacings > splittedWords;
int currentSplittedWordStart;
int currentSplittedWordEnd;
QString currentSplittedWordCompound;
bool firstCompoundWasFound;
public:
ArticleRequest( QString const & word, QString const & group,
@ -103,6 +117,25 @@ private slots:
void altSearchFinished();
void bodyFinished();
void stemmedSearchFinished();
void individualWordFinished();
private:
/// Appends the given string to 'data', with locking its mutex.
void appendToData( std::string const & );
/// Uses stemmedWordFinder to perform the next step of looking up word
/// combinations.
void compoundSearchNextStep( bool lastSearchSucceeded );
/// Creates a single word out of the [currentSplittedWordStart..End] range.
QString makeSplittedWordCompound();
/// Makes an html link to the given word.
std::string linkWord( QString const & );
/// Escapes the spacing between the words to include in html.
std::string escapeSpacing( QString const & );
};

View file

@ -636,4 +636,27 @@ wstring trimWhitespace( wstring const & in )
return wstring( wordBegin, wordSize );
}
/// Replaces every run of one or more whitespace characters (as reported by
/// isWhitespace()) in str with a single basic space, in place.
///
/// This also covers a run consisting of a single non-space whitespace
/// character (e.g. a lone tab or newline), including one at the very start of
/// the string. The string is compacted in one forward pass instead of erasing
/// each run separately.
void normalizeWhitespace( wstring & str )
{
  size_t out = 0;            // Next position to write to
  bool insideRun = false;    // Whether the previous char was whitespace

  for( size_t in = 0; in < str.size(); ++in )
  {
    if ( isWhitespace( str[ in ] ) )
    {
      // Only the first character of a run produces output
      if ( !insideRun )
      {
        str[ out++ ] = ' ';
        insideRun = true;
      }
    }
    else
    {
      str[ out++ ] = str[ in ];
      insideRun = false;
    }
  }

  // Drop the leftover tail (out never exceeds the original size)
  str.resize( out );
}
}

View file

@ -67,6 +67,9 @@ wstring trimWhitespaceOrPunct( wstring const & );
/// the word.
wstring trimWhitespace( wstring const & );
/// Turns any sequences of consecutive whitespace into a single basic space.
void normalizeWhitespace( wstring & );
/// Same as apply( wstring ), but without any heap operations, therefore
/// preferable when there're many strings to process. Returns -1 if the
/// operation succeeded, or otherwise the minimum value of outSize required

View file

@ -56,7 +56,10 @@ void WordFinder::prefixMatch( QString const & str,
// cancelled, but still it could take some time.
}
void WordFinder::stemmedMatch( QString const & str,
std::vector< sptr< Dictionary::Class > > const & dicts )
std::vector< sptr< Dictionary::Class > > const & dicts,
unsigned minLength,
unsigned maxSuffixVariation,
unsigned long maxResults )
{
cancel();
@ -64,6 +67,9 @@ void WordFinder::stemmedMatch( QString const & str,
searchType = StemmedMatch;
inputWord = str;
inputDicts = &dicts;
stemmedMinLength = minLength;
stemmedMaxSuffixVariation = maxSuffixVariation;
stemmedMaxResults = maxResults;
resultsArray.clear();
resultsIndex.clear();
@ -110,7 +116,7 @@ void WordFinder::startSearch()
sptr< Dictionary::WordSearchRequest > sr =
( searchType == PrefixMatch ) ?
(*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], 40 ) :
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], 3, 3, 30 );
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, stemmedMaxResults );
connect( sr.get(), SIGNAL( finished() ),
this, SLOT( requestFinished() ), Qt::QueuedConnection );

View file

@ -42,6 +42,9 @@ private:
PrefixMatch,
StemmedMatch
} searchType;
unsigned stemmedMinLength;
unsigned stemmedMaxSuffixVariation;
unsigned long stemmedMaxResults;
std::vector< sptr< Dictionary::Class > > const * inputDicts;
@ -76,11 +79,13 @@ public:
void prefixMatch( QString const &,
std::vector< sptr< Dictionary::Class > > const & );
/// Do a stemmed-match search in the given list of dictionaries. All comments
/// from prefixMatch() generally apply as well.
void stemmedMatch( QString const &,
std::vector< sptr< Dictionary::Class > > const & );
std::vector< sptr< Dictionary::Class > > const &,
unsigned minLength = 3,
unsigned maxSuffixVariation = 3,
unsigned long maxResults = 30 );
/// Returns the vector containing search results from the last operation.
/// If it didn't finish yet, the result is not final and may be changing