mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 19:24:08 +00:00
Refactor of expressions search via morphology suggestion
This commit is contained in:
parent
a6d9228c70
commit
67db58b4fe
|
@ -927,8 +927,8 @@ void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
|
||||||
|
|
||||||
// DPRINTF( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) );
|
// DPRINTF( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) );
|
||||||
|
|
||||||
stemmedWordFinder->prefixMatch( currentSplittedWordCompound, activeDicts, 40, // Would one be enough? Leave 40 to be safe.
|
stemmedWordFinder->expressionMatch( currentSplittedWordCompound, activeDicts, 40, // Would one be enough? Leave 40 to be safe.
|
||||||
Dictionary::SuitableForCompoundSearching );
|
Dictionary::SuitableForCompoundSearching );
|
||||||
}
|
}
|
||||||
|
|
||||||
QString ArticleRequest::makeSplittedWordCompound()
|
QString ArticleRequest::makeSplittedWordCompound()
|
||||||
|
@ -960,7 +960,6 @@ void ArticleRequest::individualWordFinished()
|
||||||
|
|
||||||
if ( results.size() )
|
if ( results.size() )
|
||||||
{
|
{
|
||||||
// Check if the aliases are acceptable
|
|
||||||
wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) );
|
wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) );
|
||||||
|
|
||||||
bool hadSomething = false;
|
bool hadSomething = false;
|
||||||
|
@ -968,7 +967,14 @@ void ArticleRequest::individualWordFinished()
|
||||||
for( unsigned x = 0; x < results.size(); ++x )
|
for( unsigned x = 0; x < results.size(); ++x )
|
||||||
{
|
{
|
||||||
if ( results[ x ].second )
|
if ( results[ x ].second )
|
||||||
continue; // We're not interested in suggestions
|
{
|
||||||
|
// Spelling suggestion match found. No need to continue.
|
||||||
|
hadSomething = true;
|
||||||
|
lastGoodCompoundResult = currentSplittedWordCompound;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prefix match found. Check if the aliases are acceptable.
|
||||||
|
|
||||||
wstring result( Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) );
|
wstring result( Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) );
|
||||||
|
|
||||||
|
|
179
hunspell.cc
179
hunspell.cc
|
@ -83,6 +83,8 @@ public:
|
||||||
virtual bool isLocalDictionary()
|
virtual bool isLocalDictionary()
|
||||||
{ return true; }
|
{ return true; }
|
||||||
|
|
||||||
|
virtual vector< wstring > getAlternateWritings( const wstring & word ) throw();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
|
||||||
virtual void loadIcon() throw();
|
virtual void loadIcon() throw();
|
||||||
|
@ -110,6 +112,16 @@ string encodeToHunspell( Hunspell &, wstring const & );
|
||||||
/// Iconv::Ex
|
/// Iconv::Ex
|
||||||
wstring decodeFromHunspell( Hunspell &, char const * );
|
wstring decodeFromHunspell( Hunspell &, char const * );
|
||||||
|
|
||||||
|
/// Generates suggestions via hunspell
|
||||||
|
QVector< wstring > suggest( wstring & word, Mutex & hunspellMutex,
|
||||||
|
Hunspell & hunspell );
|
||||||
|
|
||||||
|
/// Generates suggestions for compound expression
|
||||||
|
void getSuggestionsForExpression( wstring const & expression,
|
||||||
|
vector< wstring > & suggestions,
|
||||||
|
Mutex & hunspellMutex,
|
||||||
|
Hunspell & hunspell );
|
||||||
|
|
||||||
/// Returns true if the string contains whitespace, false otherwise
|
/// Returns true if the string contains whitespace, false otherwise
|
||||||
bool containsWhitespace( wstring const & str )
|
bool containsWhitespace( wstring const & str )
|
||||||
{
|
{
|
||||||
|
@ -142,6 +154,18 @@ void HunspellDictionary::loadIcon() throw()
|
||||||
dictionaryIconLoaded = true;
|
dictionaryIconLoaded = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns alternate writings for the given input. Only expressions, i.e.
/// strings which contain whitespace, are processed: for them the list is
/// filled by getSuggestionsForExpression(). For anything else the returned
/// list is empty.
vector< wstring > HunspellDictionary::getAlternateWritings( wstring const & word ) throw()
{
  vector< wstring > alternatives;

  // A string without whitespace is not an expression - nothing to offer here.
  if( !containsWhitespace( word ) )
    return alternatives;

  getSuggestionsForExpression( word, alternatives, getHunspellMutex(), hunspell );

  return alternatives;
}
|
||||||
|
|
||||||
/// HunspellDictionary::getArticle()
|
/// HunspellDictionary::getArticle()
|
||||||
|
|
||||||
class HunspellArticleRequest;
|
class HunspellArticleRequest;
|
||||||
|
@ -377,11 +401,6 @@ public:
|
||||||
isCancelled.ref();
|
isCancelled.ref();
|
||||||
hasExited.acquire();
|
hasExited.acquire();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
/// Generates suggestions via hunspell
|
|
||||||
QVector< wstring > suggest( wstring & word );
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void HunspellHeadwordsRequestRunnable::run()
|
void HunspellHeadwordsRequestRunnable::run()
|
||||||
|
@ -409,58 +428,18 @@ void HunspellHeadwordsRequest::run()
|
||||||
|
|
||||||
if ( containsWhitespace( trimmedWord ) )
|
if ( containsWhitespace( trimmedWord ) )
|
||||||
{
|
{
|
||||||
// Analyze each word separately and use the first suggestion, if any.
|
vector< wstring > results;
|
||||||
// This is useful for compound expressions where one of the words is
|
|
||||||
// in different form, e.g. "dozing off" -> "doze off".
|
|
||||||
// In this mode, we only provide a single suggestion at most.
|
|
||||||
|
|
||||||
wstring result;
|
getSuggestionsForExpression( trimmedWord, results, hunspellMutex, hunspell );
|
||||||
|
|
||||||
wstring word;
|
Mutex::Lock _( dataMutex );
|
||||||
|
for( unsigned i = 0; i < results.size(); i++ )
|
||||||
|
matches.push_back( results[ i ] );
|
||||||
|
|
||||||
for( wchar const * c = trimmedWord.c_str(); ; ++c )
|
|
||||||
{
|
|
||||||
if ( !*c || Folding::isPunct( *c ) || Folding::isWhitespace( * c ) )
|
|
||||||
{
|
|
||||||
if ( word.size() )
|
|
||||||
{
|
|
||||||
QVector< wstring > suggestions = suggest( word );
|
|
||||||
|
|
||||||
if ( suggestions.size() )
|
|
||||||
result += suggestions[ 0 ];
|
|
||||||
else
|
|
||||||
result += word;
|
|
||||||
|
|
||||||
word.clear();
|
|
||||||
}
|
|
||||||
if ( *c )
|
|
||||||
result.push_back( *c );
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
word.push_back( *c );
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( word.size() )
|
|
||||||
{
|
|
||||||
QVector< wstring > suggestions = suggest( trimmedWord );
|
|
||||||
|
|
||||||
if ( suggestions.size() )
|
|
||||||
result += suggestions[ 0 ];
|
|
||||||
else
|
|
||||||
result += word;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( result != trimmedWord )
|
|
||||||
{
|
|
||||||
Mutex::Lock _( dataMutex );
|
|
||||||
matches.push_back( result );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
QVector< wstring > suggestions = suggest( trimmedWord );
|
QVector< wstring > suggestions = suggest( trimmedWord, hunspellMutex, hunspell );
|
||||||
|
|
||||||
if ( !suggestions.empty() )
|
if ( !suggestions.empty() )
|
||||||
{
|
{
|
||||||
|
@ -474,7 +453,7 @@ void HunspellHeadwordsRequest::run()
|
||||||
finish();
|
finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
QVector< wstring > HunspellHeadwordsRequest::suggest( wstring & word )
|
QVector< wstring > suggest( wstring & word, Mutex & hunspellMutex, Hunspell & hunspell )
|
||||||
{
|
{
|
||||||
QVector< wstring > result;
|
QVector< wstring > result;
|
||||||
|
|
||||||
|
@ -656,6 +635,102 @@ sptr< WordSearchRequest > HunspellDictionary::prefixMatch( wstring const & word,
|
||||||
return new HunspellPrefixMatchRequest( word, getHunspellMutex(), hunspell );
|
return new HunspellPrefixMatchRequest( word, getHunspellMutex(), hunspell );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds alternative spellings of a multi-word expression out of per-word
/// suggestions obtained from suggest().
///
/// The expression is trimmed with Folding::trimWhitespaceOrPunct() and split
/// into words and the separator runs (whitespace/punctuation) between them.
/// Every word contributes itself plus at most its first two suggestions, and
/// all combinations of those variants are assembled with the original
/// separators kept in place. This is useful for compound expressions where
/// some of the words are in a different form, e.g. "dozing off" -> "doze off".
///
/// The suggestions vector is cleared first and then receives every
/// combination that differs from the trimmed expression. Expressions which
/// split into more than 21 tokens (words and separator runs both count)
/// produce no suggestions at all.
void getSuggestionsForExpression( wstring const & expression,
                                  vector<wstring> & suggestions,
                                  Mutex & hunspellMutex,
                                  Hunspell & hunspell )
{
  wstring const trimmedWord = Folding::trimWhitespaceOrPunct( expression );

  suggestions.clear();

  // Step 1: cut the string into alternating tokens - words and separator runs.
  // A separator run is only stored once the next word begins, so separators
  // with no word after them never become a token.

  QVector< wstring > tokens;
  wstring pendingWord, pendingSeparators;

  for( wchar const * cursor = trimmedWord.c_str(); ; ++cursor )
  {
    bool const atEnd = !*cursor;

    if( atEnd || Folding::isPunct( *cursor ) || Folding::isWhitespace( *cursor ) )
    {
      // A word (if any) ends here
      if( !pendingWord.empty() )
      {
        tokens.push_back( pendingWord );
        pendingWord.clear();
      }
      if( !atEnd )
        pendingSeparators.push_back( *cursor );
    }
    else
    {
      // A separator run (if any) ends here
      if( !pendingSeparators.empty() )
      {
        tokens.push_back( pendingSeparators );
        pendingSeparators.clear();
      }
      pendingWord.push_back( *cursor );
    }

    if( atEnd )
      break;
  }

  if( tokens.size() > 21 )
    return; // Too many tokens - no suggestions

  // Step 2: assemble all combinations of the word variants.

  QVector< wstring > combinations;

  for( int t = 0; t < tokens.size(); ++t )
  {
    // suggest() takes its argument by non-const reference, so work on a copy
    // and keep using that same copy afterwards.
    wstring token = tokens.at( t );

    if( Folding::isPunct( token[ 0 ] ) || Folding::isWhitespace( token[ 0 ] ) )
    {
      // Separator run: glue it onto everything assembled so far
      for( int r = 0; r < combinations.size(); ++r )
        combinations[ r ].append( token );
      continue;
    }

    QVector< wstring > const wordSuggestions = suggest( token, hunspellMutex, hunspell );

    // The word itself is always used; on top of it, two suggestions at most.
    int const alternatives = wordSuggestions.size() < 2 ? wordSuggestions.size() : 2;
    int const existing = combinations.size();

    if( existing == 0 )
    {
      // First word of the expression: it seeds the list
      combinations.push_back( token );
      for( int s = 0; s < alternatives; ++s )
        combinations.push_back( wordSuggestions.at( s ) );
      continue;
    }

    for( int r = 0; r < existing; ++r )
    {
      // Remember the text assembled so far before the word gets appended to it
      wstring const prefix = combinations.at( r );

      combinations[ r ].append( token );
      for( int s = 0; s < alternatives; ++s )
        combinations.push_back( prefix + wordSuggestions.at( s ) );
    }
  }

  // Step 3: hand out everything except the unchanged expression itself.

  for( int r = 0; r < combinations.size(); ++r )
  {
    wstring const & candidate = combinations.at( r );
    if( candidate != trimmedWord )
      suggestions.push_back( candidate );
  }
}
|
||||||
|
|
||||||
string encodeToHunspell( Hunspell & hunspell, wstring const & str )
|
string encodeToHunspell( Hunspell & hunspell, wstring const & str )
|
||||||
{
|
{
|
||||||
|
|
|
@ -86,6 +86,31 @@ void WordFinder::stemmedMatch( QString const & str,
|
||||||
startSearch();
|
startSearch();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Queues an expression-match search for str in the given dictionaries.
/// Any search in progress is cancelled and the previous results are dropped.
/// Only the address of dicts is kept, so the vector has to stay alive for as
/// long as the search uses it. The search is started right away unless
/// requests are still queued.
void WordFinder::expressionMatch( QString const & str,
                                  std::vector< sptr< Dictionary::Class > > const & dicts,
                                  unsigned long maxResults,
                                  Dictionary::Features features )
{
  cancel();

  // Forget whatever the previous search has produced
  resultsArray.clear();
  resultsIndex.clear();
  searchResults.clear();

  // Record the parameters of the new search
  inputWord = str;
  inputDicts = &dicts;
  requestedMaxResults = maxResults;
  requestedFeatures = features;
  searchType = ExpressionMatch;
  searchQueued = true;

  if ( !queuedRequests.empty() )
    return; // Requests are still queued - the search is not started from here.

  // No requests are queued, no need to wait for them to finish.
  startSearch();
}
|
||||||
|
|
||||||
void WordFinder::startSearch()
|
void WordFinder::startSearch()
|
||||||
{
|
{
|
||||||
if ( !searchQueued )
|
if ( !searchQueued )
|
||||||
|
@ -127,7 +152,7 @@ void WordFinder::startSearch()
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
sptr< Dictionary::WordSearchRequest > sr =
|
sptr< Dictionary::WordSearchRequest > sr =
|
||||||
( searchType == PrefixMatch ) ?
|
( searchType == PrefixMatch || searchType == ExpressionMatch ) ?
|
||||||
(*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], requestedMaxResults ) :
|
(*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], requestedMaxResults ) :
|
||||||
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, requestedMaxResults );
|
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, requestedMaxResults );
|
||||||
|
|
||||||
|
@ -273,6 +298,8 @@ void WordFinder::updateResults()
|
||||||
if ( updateResultsTimer.isActive() )
|
if ( updateResultsTimer.isActive() )
|
||||||
updateResultsTimer.stop(); // Can happen when we were done before it'd expire
|
updateResultsTimer.stop(); // Can happen when we were done before it'd expire
|
||||||
|
|
||||||
|
wstring original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] );
|
||||||
|
|
||||||
for( list< sptr< Dictionary::WordSearchRequest > >::iterator i =
|
for( list< sptr< Dictionary::WordSearchRequest > >::iterator i =
|
||||||
finishedRequests.begin(); i != finishedRequests.end(); )
|
finishedRequests.begin(); i != finishedRequests.end(); )
|
||||||
{
|
{
|
||||||
|
@ -282,6 +309,30 @@ void WordFinder::updateResults()
|
||||||
int weight = (**i)[ x ].weight;
|
int weight = (**i)[ x ].weight;
|
||||||
wstring lowerCased = Folding::applySimpleCaseOnly( match );
|
wstring lowerCased = Folding::applySimpleCaseOnly( match );
|
||||||
|
|
||||||
|
if( searchType == ExpressionMatch )
|
||||||
|
{
|
||||||
|
unsigned ws;
|
||||||
|
|
||||||
|
for( ws = 0; ws < allWordWritings.size(); ws++ )
|
||||||
|
{
|
||||||
|
if( ws == 0 )
|
||||||
|
{
|
||||||
|
// Check for prefix match with original expression
|
||||||
|
if( lowerCased.compare( 0, original.size(), original ) == 0 )
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if( lowerCased == Folding::applySimpleCaseOnly( allWordWritings[ ws ] ) )
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( ws >= allWordWritings.size() )
|
||||||
|
{
|
||||||
|
// No exact matches found
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
weight = ws;
|
||||||
|
}
|
||||||
pair< ResultsIndex::iterator, bool > insertResult =
|
pair< ResultsIndex::iterator, bool > insertResult =
|
||||||
resultsIndex.insert( pair< wstring, ResultsArray::iterator >( lowerCased,
|
resultsIndex.insert( pair< wstring, ResultsArray::iterator >( lowerCased,
|
||||||
resultsArray.end() ) );
|
resultsArray.end() ) );
|
||||||
|
@ -400,6 +451,7 @@ void WordFinder::updateResults()
|
||||||
resultsArray.sort( SortByRank() );
|
resultsArray.sort( SortByRank() );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
if( searchType == StemmedMatch )
|
||||||
{
|
{
|
||||||
// Handling stemmed matches
|
// Handling stemmed matches
|
||||||
|
|
||||||
|
|
|
@ -41,7 +41,8 @@ private:
|
||||||
enum SearchType
|
enum SearchType
|
||||||
{
|
{
|
||||||
PrefixMatch,
|
PrefixMatch,
|
||||||
StemmedMatch
|
StemmedMatch,
|
||||||
|
ExpressionMatch
|
||||||
} searchType;
|
} searchType;
|
||||||
unsigned long requestedMaxResults;
|
unsigned long requestedMaxResults;
|
||||||
Dictionary::Features requestedFeatures;
|
Dictionary::Features requestedFeatures;
|
||||||
|
@ -94,6 +95,13 @@ public:
|
||||||
unsigned long maxResults = 30,
|
unsigned long maxResults = 30,
|
||||||
Dictionary::Features = Dictionary::NoFeatures );
|
Dictionary::Features = Dictionary::NoFeatures );
|
||||||
|
|
||||||
|
/// Do the expression-match search in the given list of dictionaries.
|
||||||
|
/// Finds exact matches for one of the spelling suggestions.
|
||||||
|
void expressionMatch( QString const &,
|
||||||
|
std::vector< sptr< Dictionary::Class > > const &,
|
||||||
|
unsigned long maxResults = 40,
|
||||||
|
Dictionary::Features = Dictionary::NoFeatures );
|
||||||
|
|
||||||
/// Returns the vector containing search results from the last operation.
|
/// Returns the vector containing search results from the last operation.
|
||||||
/// If it didn't finish yet, the result is not final and may be changing
|
/// If it didn't finish yet, the result is not final and may be changing
|
||||||
/// over time.
|
/// over time.
|
||||||
|
|
Loading…
Reference in a new issue