Successfully find arbitrarily large compound expressions.

Previously the program could only reliably find two-word compounds. Now it
finds all of them, even when a compound is a long phrase made of many words.

To choose which dictionaries are used as the source for compound lookups, a
notion of dictionary features was added. It may be put to further use later.
This commit is contained in:
Konstantin Isakov 2010-05-30 00:50:16 +04:00
parent 9eb20cf0b6
commit 67ca2ee1dd
6 changed files with 77 additions and 17 deletions

View file

@ -573,12 +573,10 @@ void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
string footer; string footer;
if ( currentSplittedWordEnd - currentSplittedWordStart > 1 ) // We have something to append if ( lastGoodCompoundResult.size() ) // We have something to append
{ {
// printf( "Appending\n" ); // printf( "Appending\n" );
--currentSplittedWordEnd;
if ( !firstCompoundWasFound ) if ( !firstCompoundWasFound )
{ {
// Append the beginning // Append the beginning
@ -594,7 +592,9 @@ void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
footer += " / "; footer += " / ";
} }
footer += linkWord( makeSplittedWordCompound() ); footer += linkWord( lastGoodCompoundResult );
lastGoodCompoundResult.clear();
} }
// Then, start a new search for the next word, if possible // Then, start a new search for the next word, if possible
@ -672,7 +672,8 @@ void ArticleRequest::compoundSearchNextStep( bool lastSearchSucceeded )
// printf( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) ); // printf( "Looking up %s\n", qPrintable( currentSplittedWordCompound ) );
stemmedWordFinder->stemmedMatch( currentSplittedWordCompound, activeDicts, 0, 0, 1 ); stemmedWordFinder->prefixMatch( currentSplittedWordCompound, activeDicts, 40, // Would one be enough? Leave 40 to be safe.
Dictionary::SuitableForCompoundSearching );
} }
QString ArticleRequest::makeSplittedWordCompound() QString ArticleRequest::makeSplittedWordCompound()
@ -707,10 +708,32 @@ void ArticleRequest::individualWordFinished()
// Check if the aliases are acceptable // Check if the aliases are acceptable
wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) ); wstring source = Folding::applySimpleCaseOnly( gd::toWString( currentSplittedWordCompound ) );
bool hadSomething = false;
for( unsigned x = 0; x < results.size(); ++x ) for( unsigned x = 0; x < results.size(); ++x )
if ( source == Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) )
{ {
// Ok, good enough if ( results[ x ].second )
continue; // We're not interested in suggestions
wstring result( Folding::applySimpleCaseOnly( gd::toWString( results[ x ].first ) ) );
if ( source.size() <= result.size() && result.compare( 0, source.size(), source ) == 0 )
{
// The resulting string begins with the source one
hadSomething = true;
if ( source.size() == result.size() )
{
// Got the match. No need to continue.
lastGoodCompoundResult = currentSplittedWordCompound;
break;
}
}
}
if ( hadSomething )
{
compoundSearchNextStep( true ); compoundSearchNextStep( true );
return; return;
} }

View file

@ -100,6 +100,7 @@ class ArticleRequest: public Dictionary::DataRequest
int currentSplittedWordStart; int currentSplittedWordStart;
int currentSplittedWordEnd; int currentSplittedWordEnd;
QString currentSplittedWordCompound; QString currentSplittedWordCompound;
QString lastGoodCompoundResult;
bool firstCompoundWasFound; bool firstCompoundWasFound;
public: public:

View file

@ -139,6 +139,10 @@ public:
BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ); BtreeDictionary( string const & id, vector< string > const & dictionaryFiles );
/// Btree-indexed dictionaries are usually a good source for compound searches,
/// so this override reports the SuitableForCompoundSearching feature.
/// Never throws.
virtual Dictionary::Features getFeatures() const throw()
{ return Dictionary::SuitableForCompoundSearching; }
/// This function does the search using the btree index. Derivatives /// This function does the search using the btree index. Derivatives
/// need not to implement this function. /// need not to implement this function.
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &, virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &,

View file

@ -222,6 +222,18 @@ public:
{ return data; } { return data; }
}; };
/// Dictionary features. Different dictionaries can possess different features,
/// which hint at some of their aspects. The enumerators are combined into a
/// Features flag set (declared right after the enum), so each feature is meant
/// to be tested with bitwise operations.
enum Feature
{
/// No features: the empty flag set.
NoFeatures = 0,
/// The dictionary is suitable to query when searching for compound expressions.
SuitableForCompoundSearching = 1
};
// Declares Features as the flag-set type built from Feature values, and the
// operators needed to combine individual Feature values into a Features set.
Q_DECLARE_FLAGS( Features, Feature )
Q_DECLARE_OPERATORS_FOR_FLAGS( Features )
/// A dictionary. Can be used to query words. /// A dictionary. Can be used to query words.
class Class class Class
@ -259,6 +271,11 @@ public:
/// description etc. All strings are in utf8. /// description etc. All strings are in utf8.
virtual map< Property, string > getProperties() throw()=0; virtual map< Property, string > getProperties() throw()=0;
/// Returns the features the dictionary possesses. See the Feature enum for
/// their list. The default implementation reports NoFeatures; dictionaries
/// that have some features override this. Never throws.
virtual Features getFeatures() const throw()
{ return NoFeatures; }
/// Returns the number of articles in the dictionary. /// Returns the number of articles in the dictionary.
virtual unsigned long getArticleCount() throw()=0; virtual unsigned long getArticleCount() throw()=0;

View file

@ -32,7 +32,9 @@ WordFinder::~WordFinder()
} }
void WordFinder::prefixMatch( QString const & str, void WordFinder::prefixMatch( QString const & str,
std::vector< sptr< Dictionary::Class > > const & dicts ) std::vector< sptr< Dictionary::Class > > const & dicts,
unsigned long maxResults,
Dictionary::Features features )
{ {
cancel(); cancel();
@ -40,6 +42,8 @@ void WordFinder::prefixMatch( QString const & str,
searchType = PrefixMatch; searchType = PrefixMatch;
inputWord = str; inputWord = str;
inputDicts = &dicts; inputDicts = &dicts;
requestedMaxResults = maxResults;
requestedFeatures = features;
resultsArray.clear(); resultsArray.clear();
resultsIndex.clear(); resultsIndex.clear();
@ -59,7 +63,8 @@ void WordFinder::stemmedMatch( QString const & str,
std::vector< sptr< Dictionary::Class > > const & dicts, std::vector< sptr< Dictionary::Class > > const & dicts,
unsigned minLength, unsigned minLength,
unsigned maxSuffixVariation, unsigned maxSuffixVariation,
unsigned long maxResults ) unsigned long maxResults,
Dictionary::Features features )
{ {
cancel(); cancel();
@ -67,9 +72,10 @@ void WordFinder::stemmedMatch( QString const & str,
searchType = StemmedMatch; searchType = StemmedMatch;
inputWord = str; inputWord = str;
inputDicts = &dicts; inputDicts = &dicts;
requestedMaxResults = maxResults;
requestedFeatures = features;
stemmedMinLength = minLength; stemmedMinLength = minLength;
stemmedMaxSuffixVariation = maxSuffixVariation; stemmedMaxSuffixVariation = maxSuffixVariation;
stemmedMaxResults = maxResults;
resultsArray.clear(); resultsArray.clear();
resultsIndex.clear(); resultsIndex.clear();
@ -111,12 +117,15 @@ void WordFinder::startSearch()
for( size_t x = 0; x < inputDicts->size(); ++x ) for( size_t x = 0; x < inputDicts->size(); ++x )
{ {
if ( ( (*inputDicts)[ x ]->getFeatures() & requestedFeatures ) != requestedFeatures )
continue;
for( size_t y = 0; y < allWordWritings.size(); ++y ) for( size_t y = 0; y < allWordWritings.size(); ++y )
{ {
sptr< Dictionary::WordSearchRequest > sr = sptr< Dictionary::WordSearchRequest > sr =
( searchType == PrefixMatch ) ? ( searchType == PrefixMatch ) ?
(*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], 40 ) : (*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], requestedMaxResults ) :
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, stemmedMaxResults ); (*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, requestedMaxResults );
connect( sr.get(), SIGNAL( finished() ), connect( sr.get(), SIGNAL( finished() ),
this, SLOT( requestFinished() ), Qt::QueuedConnection ); this, SLOT( requestFinished() ), Qt::QueuedConnection );

View file

@ -42,9 +42,10 @@ private:
PrefixMatch, PrefixMatch,
StemmedMatch StemmedMatch
} searchType; } searchType;
unsigned long requestedMaxResults;
Dictionary::Features requestedFeatures;
unsigned stemmedMinLength; unsigned stemmedMinLength;
unsigned stemmedMaxSuffixVariation; unsigned stemmedMaxSuffixVariation;
unsigned long stemmedMaxResults;
std::vector< sptr< Dictionary::Class > > const * inputDicts; std::vector< sptr< Dictionary::Class > > const * inputDicts;
@ -74,10 +75,14 @@ public:
/// the exact matches would be found. All search results are put into a single /// the exact matches would be found. All search results are put into a single
/// list containing the exact matches first, then the prefix ones. Duplicate /// list containing the exact matches first, then the prefix ones. Duplicate
/// matches from different dictionaries are merged together. /// matches from different dictionaries are merged together.
/// If a list of features is specified, the search will only be performed in
/// the dictionaries which possess all the features requested.
/// If there already was a prefixMatch operation underway, it gets cancelled /// If there already was a prefixMatch operation underway, it gets cancelled
/// and the new one replaces it. /// and the new one replaces it.
void prefixMatch( QString const &, void prefixMatch( QString const &,
std::vector< sptr< Dictionary::Class > > const & ); std::vector< sptr< Dictionary::Class > > const &,
unsigned long maxResults = 40,
Dictionary::Features = Dictionary::NoFeatures );
/// Do a stemmed-match search in the given list of dictionaries. All comments /// Do a stemmed-match search in the given list of dictionaries. All comments
/// from prefixMatch() generally apply as well. /// from prefixMatch() generally apply as well.
@ -85,7 +90,8 @@ public:
std::vector< sptr< Dictionary::Class > > const &, std::vector< sptr< Dictionary::Class > > const &,
unsigned minLength = 3, unsigned minLength = 3,
unsigned maxSuffixVariation = 3, unsigned maxSuffixVariation = 3,
unsigned long maxResults = 30 ); unsigned long maxResults = 30,
Dictionary::Features = Dictionary::NoFeatures );
/// Returns the vector containing search results from the last operation. /// Returns the vector containing search results from the last operation.
/// If it didn't finish yet, the result is not final and may be changing /// If it didn't finish yet, the result is not final and may be changing