goldendict-ng/wordfinder.cc
Konstantin Isakov 04bdf3aa36 For websites and forvo, don't add dummy word search results (as they don't have any index to search in) -- instead, make results empty, but mark the search uncertain, and don't mark the word input line as reddish in that case.
This is just a refinement of how the word search is done in the dictionaries
that don't actually have the ability to search words in. Previously they emitted dummy
italicized suggestions, which were getting in the way. Now they don't emit
anything, but mark the search as uncertain. Uncertain searches don't mark
the word input with a different color to indicate that the search has failed.
2010-06-22 16:43:11 +04:00

464 lines
14 KiB
C++

/* This file is (c) 2008-2010 Konstantin Isakov <ikm@users.berlios.de>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "wordfinder.hh"
#include "folding.hh"
#include "wstring_qt.hh"
#include <QThreadPool>
#include <map>
using std::vector;
using std::list;
using gd::wstring;
using gd::wchar;
using std::map;
using std::pair;
/// Creates an idle word finder: no search is running and none is queued.
/// @param parent  Forwarded to the QObject base class.
WordFinder::WordFinder( QObject * parent ):
  QObject( parent ), searchInProgress( false ),
  updateResultsTimer( this ),
  searchQueued( false )
{
  // The timer fires once per start(), one second later, and triggers
  // updateResults() so partial results can be published while other
  // requests are still running.
  updateResultsTimer.setSingleShot( true );
  updateResultsTimer.setInterval( 1000 );

  connect( &updateResultsTimer, SIGNAL( timeout() ),
           this, SLOT( updateResults() ) );
}
/// Cancels whatever is still running or queued and drops all request lists
/// (see clear()) before the object goes away.
WordFinder::~WordFinder()
{
  this->clear();
}
/// Schedules a prefix-match search, replacing any search that is currently
/// running or queued.
/// @param str         The word to look for.
/// @param dicts       Dictionaries to query. Only the address of this vector
///                    is stored, so it is dereferenced again later when the
///                    search actually starts.
/// @param maxResults  Passed on to every dictionary's prefixMatch().
/// @param features    Only dictionaries providing all of these features are
///                    queried.
void WordFinder::prefixMatch( QString const & str,
                              std::vector< sptr< Dictionary::Class > > const & dicts,
                              unsigned long maxResults,
                              Dictionary::Features features )
{
  // Abandon the previous search first
  cancel();

  // Remember what was asked for
  searchType = PrefixMatch;
  inputWord = str;
  inputDicts = &dicts;
  requestedMaxResults = maxResults;
  requestedFeatures = features;

  // Forget everything accumulated for the previous search
  searchResults.clear();
  resultsIndex.clear();
  resultsArray.clear();

  searchQueued = true;

  if ( !queuedRequests.empty() )
  {
    // Some of the just-cancelled requests haven't finished yet. The last one
    // of them to finish triggers the queued search from requestFinished().
    // They were all cancelled, so this normally shouldn't take long.
    return;
  }

  // Nothing to wait for -- start right away
  startSearch();
}
/// Schedules a stemmed-match search, replacing any search that is currently
/// running or queued.
/// @param str                 The word to look for.
/// @param dicts               Dictionaries to query. Only the address of this
///                            vector is stored, so it is dereferenced again
///                            later when the search actually starts.
/// @param minLength           Passed on to every dictionary's stemmedMatch().
/// @param maxSuffixVariation  Passed on to every dictionary's stemmedMatch().
/// @param maxResults          Passed on to every dictionary's stemmedMatch().
/// @param features            Only dictionaries providing all of these
///                            features are queried.
void WordFinder::stemmedMatch( QString const & str,
                               std::vector< sptr< Dictionary::Class > > const & dicts,
                               unsigned minLength,
                               unsigned maxSuffixVariation,
                               unsigned long maxResults,
                               Dictionary::Features features )
{
  // Abandon the previous search first
  cancel();

  // Remember what was asked for
  searchType = StemmedMatch;
  inputWord = str;
  inputDicts = &dicts;
  stemmedMinLength = minLength;
  stemmedMaxSuffixVariation = maxSuffixVariation;
  requestedMaxResults = maxResults;
  requestedFeatures = features;

  // Forget everything accumulated for the previous search
  searchResults.clear();
  resultsIndex.clear();
  resultsArray.clear();

  searchQueued = true;

  if ( !queuedRequests.empty() )
  {
    // Cancelled requests are still pending; requestFinished() starts the
    // queued search once the last of them is gone.
    return;
  }

  startSearch();
}
void WordFinder::startSearch()
{
if ( !searchQueued )
return; // Search was probably cancelled
// Clear the requests just in case
queuedRequests.clear();
finishedRequests.clear();
searchErrorString.clear();
searchResultsUncertain = false;
searchQueued = false;
searchInProgress = true;
// Gather all writings of the word
if ( allWordWritings.size() != 1 )
allWordWritings.resize( 1 );
allWordWritings[ 0 ] = gd::toWString( inputWord );
for( size_t x = 0; x < inputDicts->size(); ++x )
{
vector< wstring > writings = (*inputDicts)[ x ]->getAlternateWritings( allWordWritings[ 0 ] );
allWordWritings.insert( allWordWritings.end(), writings.begin(), writings.end() );
}
// Query each dictionary for all word writings
for( size_t x = 0; x < inputDicts->size(); ++x )
{
if ( ( (*inputDicts)[ x ]->getFeatures() & requestedFeatures ) != requestedFeatures )
continue;
for( size_t y = 0; y < allWordWritings.size(); ++y )
{
sptr< Dictionary::WordSearchRequest > sr =
( searchType == PrefixMatch ) ?
(*inputDicts)[ x ]->prefixMatch( allWordWritings[ y ], requestedMaxResults ) :
(*inputDicts)[ x ]->stemmedMatch( allWordWritings[ y ], stemmedMinLength, stemmedMaxSuffixVariation, requestedMaxResults );
connect( sr.get(), SIGNAL( finished() ),
this, SLOT( requestFinished() ), Qt::QueuedConnection );
queuedRequests.push_back( sr );
}
}
// Handle any requests finished already
requestFinished();
}
/// Abandons both the running search and any queued one, and asks every
/// still-queued request to cancel. The request lists themselves are kept.
void WordFinder::cancel()
{
  searchInProgress = false;
  searchQueued = false;

  cancelSearches();
}
/// Cancels everything (see cancel()) and then drops this object's references
/// to all requests, both the still-queued and the finished ones.
void WordFinder::clear()
{
  cancel();

  finishedRequests.clear();
  queuedRequests.clear();
}
void WordFinder::requestFinished()
{
bool newResults = false;
// See how many new requests have finished, and if we have any new results
for( list< sptr< Dictionary::WordSearchRequest > >::iterator i =
queuedRequests.begin(); i != queuedRequests.end(); )
{
if ( (*i)->isFinished() )
{
if ( searchInProgress && !(*i)->getErrorString().isEmpty() )
searchErrorString = tr( "Failed to query some dictionaries." );
if ( (*i)->isUncertain() )
searchResultsUncertain = true;
if ( (*i)->matchesCount() )
{
newResults = true;
// This list is handled by updateResults()
finishedRequests.splice( finishedRequests.end(), queuedRequests, i++ );
}
else // We won't do anything with it anymore, so we erase it
queuedRequests.erase( i++ );
}
else
++i;
}
if ( !searchInProgress )
{
// There is no search in progress, so we just wait until there's
// no requests left
if ( queuedRequests.empty() )
{
// We got rid of all queries, queued search can now start
finishedRequests.clear();
if ( searchQueued )
startSearch();
}
return;
}
if ( newResults && queuedRequests.size() && !updateResultsTimer.isActive() )
{
// If we have got some new results, but not all of them, we would start a
// timer to update a user some time in the future
updateResultsTimer.start();
}
if ( queuedRequests.empty() )
{
// Search is finished.
updateResults();
}
}
namespace {
/// Clamps x to at most 255, so the value fits into the low byte of a rank
/// (ranks are built as category * 256 + secondary value).
///
/// The parameter is size_t rather than unsigned because callers pass string
/// offsets and sizes. With an unsigned parameter those would be truncated
/// before the comparison on platforms where size_t is wider than unsigned,
/// so a very large offset could wrap around to a small value.
unsigned saturated( size_t x )
{
  return x < 255 ? static_cast< unsigned >( x ) : 255u;
}
/// Looks for an occurrence of needle inside haystack that stands alone, i.e.
/// is delimited on each side by whitespace, punctuation, or the beginning/end
/// of the haystack.
/// @param haystack  The string to search in.
/// @param needle    The string to search for.
/// @param pos       On a true return, the offset of the occurrence within the
///                  haystack, capped at 255. On a false return its value is
///                  not meaningful.
/// @return true if such an occurrence exists.
bool hasSurroundedWithWs( wstring const & haystack, wstring const & needle,
                          wstring::size_type & pos )
{
  wstring::size_type const needleSize = needle.size();

  if ( needleSize > haystack.size() )
    return false; // The needle is longer than the haystack, it can't be there

  pos = haystack.find( needle, 0 );

  while( pos != wstring::npos )
  {
    // The left side is fine at the very beginning of the haystack, or after
    // a whitespace or punctuation character.
    bool const leftDelimited =
      pos == 0 ||
      Folding::isWhitespace( haystack[ pos - 1 ] ) ||
      Folding::isPunct( haystack[ pos - 1 ] );

    if ( leftDelimited )
    {
      wstring::size_type const after = pos + needleSize;

      // Likewise for the right side: the end of the haystack, or a whitespace
      // or punctuation character following the occurrence.
      bool const rightDelimited =
        after == haystack.size() ||
        Folding::isWhitespace( haystack[ after ] ) ||
        Folding::isPunct( haystack[ after ] );

      if ( rightDelimited )
      {
        pos = saturated( pos );
        return true;
      }
    }

    // Not a standalone occurrence -- retry one character further
    pos = haystack.find( needle, pos + 1 );
  }

  return false; // No (more) occurrences
}
}
/// Merges the matches of all finished requests into the accumulated results,
/// ranks and sorts them, rebuilds searchResults and notifies listeners.
///
/// This is the slot connected to updateResultsTimer's timeout(), and it is
/// also called directly by requestFinished() once no requests remain queued.
/// It emits updated() while requests are still queued, and finished() (after
/// resetting searchInProgress) otherwise.
void WordFinder::updateResults()
{
if ( !searchInProgress )
return; // Old queued signal
if ( updateResultsTimer.isActive() )
updateResultsTimer.stop(); // Can happen when we were done before it'd expire
// Step 1: drain finishedRequests, merging every match into resultsArray.
// resultsIndex maps the simple-case-folded word to its entry in resultsArray,
// so words differing only by case collapse into a single result.
for( list< sptr< Dictionary::WordSearchRequest > >::iterator i =
finishedRequests.begin(); i != finishedRequests.end(); )
{
for( size_t count = (*i)->matchesCount(), x = 0; x < count; ++x )
{
wstring match = (**i)[ x ].word;
int weight = (**i)[ x ].weight;
wstring lowerCased = Folding::applySimpleCaseOnly( match );
// Try to insert with a placeholder iterator; for a genuinely new key the
// placeholder is replaced with the real position further below.
pair< ResultsIndex::iterator, bool > insertResult =
resultsIndex.insert( pair< wstring, ResultsArray::iterator >( lowerCased,
resultsArray.end() ) );
if ( !insertResult.second )
{
// Wasn't inserted since there was already an item -- check the case
if ( insertResult.first->second->word != match )
{
// The case is different -- agree on a lowercase version
insertResult.first->second->word = lowerCased;
}
// A match with zero weight clears the existing entry's wasSuggested flag
if ( !weight && insertResult.first->second->wasSuggested )
insertResult.first->second->wasSuggested = false;
}
else
{
// A new word: append it with the worst possible rank. The ranking passes
// below only ever lower the rank.
resultsArray.push_back( OneResult() );
resultsArray.back().word = match;
resultsArray.back().rank = INT_MAX;
resultsArray.back().wasSuggested = ( weight != 0 );
insertResult.first->second = --resultsArray.end();
}
}
// The request is fully consumed -- drop it and move on to the next one
finishedRequests.erase( i++ );
}
// Step 2: rank and sort. This default cap applies to prefix matches; the
// stemmed branch below lowers it.
size_t maxSearchResults = 500;
if ( resultsArray.size() )
{
if ( searchType == PrefixMatch )
{
/// Assign each result a category, storing it in the rank's field
// Lower categories are better matches. Note that the PrefixMatch enumerator
// declared here shares its name with the search-type value compared against
// above; inside this scope the name refers to the category.
enum Category
{
ExactMatch,
ExactNoFullCaseMatch,
ExactNoDiaMatch,
ExactNoPunctMatch,
ExactNoWsMatch,
ExactInsideMatch,
ExactNoDiaInsideMatch,
ExactNoPunctInsideMatch,
PrefixMatch,
PrefixNoDiaMatch,
PrefixNoPunctMatch,
PrefixNoWsMatch,
WorstMatch,
Multiplier = 256 // Categories should be multiplied by Multiplier
};
// Every result is ranked against every writing of the input word
for( unsigned wr = 0; wr < allWordWritings.size(); ++wr )
{
// Progressively more aggressive foldings of the target; each one is built
// on top of the previous one.
wstring target = Folding::applySimpleCaseOnly( allWordWritings[ wr ] );
wstring targetNoFullCase = Folding::applyFullCaseOnly( target );
wstring targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase );
wstring targetNoPunct = Folding::applyPunctOnly( targetNoDia );
wstring targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct );
wstring::size_type matchPos = 0;
for( ResultsIndex::const_iterator i = resultsIndex.begin(), j = resultsIndex.end();
i != j; ++i )
{
// The folded forms of the result are computed lazily by the assignments
// inside the conditions below. Each one is derived from the one assigned in
// the preceding condition, so the order of this chain matters: by the time
// the "inside" and prefix checks are reached, all four have been filled in.
wstring resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs;
int rank;
if ( i->first == target )
rank = ExactMatch * Multiplier;
else
if ( ( resultNoFullCase = Folding::applyFullCaseOnly( i->first ) ) == targetNoFullCase )
rank = ExactNoFullCaseMatch * Multiplier;
else
if ( ( resultNoDia = Folding::applyDiacriticsOnly( resultNoFullCase ) ) == targetNoDia )
rank = ExactNoDiaMatch * Multiplier;
else
if ( ( resultNoPunct = Folding::applyPunctOnly( resultNoDia ) ) == targetNoPunct )
rank = ExactNoPunctMatch * Multiplier;
else
if ( ( resultNoWs = Folding::applyWhitespaceOnly( resultNoPunct ) ) == targetNoWs )
rank = ExactNoWsMatch * Multiplier;
else
// "Inside" matches: the target occurs as a standalone word within the
// result. The (capped) offset of the occurrence is the secondary key.
if ( hasSurroundedWithWs( i->first, target, matchPos ) )
rank = ExactInsideMatch * Multiplier + matchPos;
else
if ( hasSurroundedWithWs( resultNoDia, targetNoDia, matchPos ) )
rank = ExactNoDiaInsideMatch * Multiplier + matchPos;
else
if ( hasSurroundedWithWs( resultNoPunct, targetNoPunct, matchPos ) )
rank = ExactNoPunctInsideMatch * Multiplier + matchPos;
else
// Prefix matches: the result starts with the target and is longer than it.
// The (capped) length of the result is the secondary key.
if ( i->first.size() > target.size() && i->first.compare( 0, target.size(), target ) == 0 )
rank = PrefixMatch * Multiplier + saturated( i->first.size() );
else
if ( resultNoDia.size() > targetNoDia.size() && resultNoDia.compare( 0, targetNoDia.size(), targetNoDia ) == 0 )
rank = PrefixNoDiaMatch * Multiplier + saturated( i->first.size() );
else
if ( resultNoPunct.size() > targetNoPunct.size() && resultNoPunct.compare( 0, targetNoPunct.size(), targetNoPunct ) == 0 )
rank = PrefixNoPunctMatch * Multiplier + saturated( i->first.size() );
else
if ( resultNoWs.size() > targetNoWs.size() && resultNoWs.compare( 0, targetNoWs.size(), targetNoWs ) == 0 )
rank = PrefixNoWsMatch * Multiplier + saturated( i->first.size() );
else
rank = WorstMatch * Multiplier;
if ( i->second->rank > rank )
i->second->rank = rank; // We store the best rank of any writing
}
}
resultsArray.sort( SortByRank() );
}
else
{
// Handling stemmed matches
// We use two factors -- first is the number of characters strings share
// in their beginnings, and second, the length of the strings. Here we assign
// only the first one, storing it in rank. Then we sort the results using
// SortByRankAndLength.
for( unsigned wr = 0; wr < allWordWritings.size(); ++wr )
{
wstring target = Folding::apply( allWordWritings[ wr ] );
for( ResultsIndex::const_iterator i = resultsIndex.begin(), j = resultsIndex.end();
i != j; ++i )
{
wstring resultFolded = Folding::apply( i->first );
int charsInCommon = 0;
// Count the length of the common prefix of the two folded strings; the
// loop has an empty body and stops at the first difference or at the end
// of the target.
for( wchar const * t = target.c_str(), * r = resultFolded.c_str();
*t && *t == *r; ++t, ++r, ++charsInCommon ) ;
int rank = -charsInCommon; // Negated so the lesser-than
// comparison would yield right
// results.
if ( i->second->rank > rank )
i->second->rank = rank; // We store the best rank of any writing
}
}
resultsArray.sort( SortByRankAndLength() );
// Stemmed searches report far fewer results than prefix ones
maxSearchResults = 15;
}
}
// Step 3: rebuild the externally visible list from the sorted results,
// keeping at most maxSearchResults entries.
searchResults.clear();
searchResults.reserve( resultsArray.size() < maxSearchResults ? resultsArray.size() : maxSearchResults );
for( ResultsArray::const_iterator i = resultsArray.begin(), j = resultsArray.end();
i != j; ++i )
{
//printf( "%d: %ls\n", i->second, i->first.c_str() );
if ( searchResults.size() < maxSearchResults )
searchResults.push_back( std::pair< QString, bool >( gd::toQString( i->word ), i->wasSuggested ) );
else
break;
}
// Step 4: tell the listeners whether this was a partial or the final update
if ( queuedRequests.size() )
{
// There are still some unhandled results.
emit updated();
}
else
{
// Those were all of them.
searchInProgress = false;
emit finished();
}
}
void WordFinder::cancelSearches()
{
for( list< sptr< Dictionary::WordSearchRequest > >::iterator i =
queuedRequests.begin(); i != queuedRequests.end(); ++i )
(*i)->cancel();
}