Apply morphological analysis to compound expressions as well, by querying each word in the expression separately (e.g. "dozing off" -> "doze off").

This commit is contained in:
Konstantin Isakov 2010-08-31 23:26:11 +04:00
parent 7c0b532bd1
commit dd92f5af25

View file

@ -322,6 +322,11 @@ public:
isCancelled.ref(); isCancelled.ref();
hasExited.acquire(); hasExited.acquire();
} }
private:
/// Generates suggestions via hunspell
QVector< wstring > suggest( wstring & word );
}; };
void HunspellHeadwordsRequestRunnable::run() void HunspellHeadwordsRequestRunnable::run()
@ -337,24 +342,93 @@ void HunspellHeadwordsRequest::run()
return; return;
} }
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( trimmedWord.size() > 80 )
{
// We won't do anything for overly long sentences since that would probably
// only waste time.
finish();
return;
}
if ( containsWhitespace( trimmedWord ) )
{
  // Compound expression: analyze each word separately and substitute the
  // first suggestion for it, if any. This is useful for expressions where
  // one of the words is in a different form, e.g. "dozing off" -> "doze off".
  // In this mode, we only provide a single suggestion at most.

  wstring result;

  // The word currently being accumulated. Deliberately not called 'word',
  // so that it doesn't shadow the outer 'word' trimmedWord was made from.
  wstring currentWord;

  for( wchar const * c = trimmedWord.c_str(); *c; ++c )
  {
    if ( Folding::isPunct( *c ) || Folding::isWhitespace( *c ) )
    {
      // A separator ends the current word, if there is one
      if ( currentWord.size() )
      {
        QVector< wstring > suggestions = suggest( currentWord );

        if ( suggestions.size() )
          result += suggestions[ 0 ];
        else
          result += currentWord; // No suggestion -- keep the word as it was

        currentWord.clear();
      }

      // Separators are copied to the result unchanged
      result.push_back( *c );
    }
    else
      currentWord.push_back( *c );
  }

  // Flush the last word, which has no separator after it. It must be
  // analyzed on its own, just like the others -- passing the whole
  // expression here would leave the last word unanalyzed.
  if ( currentWord.size() )
  {
    QVector< wstring > suggestions = suggest( currentWord );

    if ( suggestions.size() )
      result += suggestions[ 0 ];
    else
      result += currentWord;
  }

  // Only report the rebuilt expression if at least one word got replaced
  if ( result != trimmedWord )
  {
    Mutex::Lock _( dataMutex );
    matches.push_back( result );
  }
}
else
{
QVector< wstring > suggestions = suggest( trimmedWord );
if ( !suggestions.empty() )
{
Mutex::Lock _( dataMutex );
for( int x = 0; x < suggestions.size(); ++x )
matches.push_back( suggestions[ x ] );
}
}
finish();
}
QVector< wstring > HunspellHeadwordsRequest::suggest( wstring & word )
{
QVector< wstring > result;
// We'd need to free this if it gets allocated and an exception shows up // We'd need to free this if it gets allocated and an exception shows up
char ** suggestions = 0; char ** suggestions = 0;
int suggestionsCount = 0; int suggestionsCount = 0;
try try
{ {
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( containsWhitespace( trimmedWord ) )
{
// For now we don't analyze whitespace-containing phrases
finish();
return;
}
Mutex::Lock _( hunspellMutex ); Mutex::Lock _( hunspellMutex );
string encodedWord = encodeToHunspell( hunspell, trimmedWord ); string encodedWord = encodeToHunspell( hunspell, word );
suggestionsCount = hunspell.analyze( &suggestions, encodedWord.c_str() ); suggestionsCount = hunspell.analyze( &suggestions, encodedWord.c_str() );
@ -364,7 +438,7 @@ void HunspellHeadwordsRequest::run()
wstring lowercasedWord = Folding::applySimpleCaseOnly( word ); wstring lowercasedWord = Folding::applySimpleCaseOnly( word );
QRegExp cutStem( "^\\s*st:(((\\s+(?!\\w{2}:))|\\S+)+)" ); static QRegExp cutStem( "^\\s*st:(((\\s+(?!\\w{2}:))|\\S+)+)" );
for( int x = 0; x < suggestionsCount; ++x ) for( int x = 0; x < suggestionsCount; ++x )
{ {
@ -379,10 +453,7 @@ void HunspellHeadwordsRequest::run()
if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word
{ {
printf( ">>>>>Alt: %ls\n", alt.c_str() ); printf( ">>>>>Alt: %ls\n", alt.c_str() );
result.append( alt );
Mutex::Lock _( dataMutex );
matches.push_back( alt );
} }
} }
} }
@ -400,9 +471,10 @@ void HunspellHeadwordsRequest::run()
hunspell.free_list( &suggestions, suggestionsCount ); hunspell.free_list( &suggestions, suggestionsCount );
} }
finish(); return result;
} }
sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( wstring const & word ) sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( wstring const & word )
throw( std::exception ) throw( std::exception )
{ {