goldendict-ng/src/wordfinder.cc

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

500 lines
15 KiB
C++
Raw Normal View History

2012-02-20 21:47:14 +00:00
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2009-01-29 19:16:25 +00:00
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "wordfinder.hh"
#include "folding.hh"
#include <map>
2024-11-21 16:48:11 +00:00
2009-01-29 19:16:25 +00:00
using std::vector;
using std::list;
2009-01-29 19:16:25 +00:00
using std::map;
using std::pair;
/// Creates an idle finder (no search running, none queued) and prepares the
/// timer used to publish partial results while a search is still in progress.
WordFinder::WordFinder( QObject * parent ):
  QObject( parent ),
  searchInProgress( false ),
  updateResultsTimer( this ),
  searchQueued( false )
{
  // The timeout is delivered through a queued connection, so updateResults()
  // is invoked from the event loop rather than directly from the timer.
  connect( &updateResultsTimer, &QTimer::timeout, this, &WordFinder::updateResults, Qt::QueuedConnection );

  // The timer fires only once per start() and waits one second (1000 ms).
  updateResultsTimer.setSingleShot( true );
  updateResultsTimer.setInterval( 1000 );
}
/// Cancels any outstanding search and releases all held requests by
/// delegating to clear().
WordFinder::~WordFinder()
{
  clear();
}
/// Queues a prefix-match search for @p str over @p dicts, replacing any search
/// that is currently running or queued.
///
/// Only the address of @p dicts is stored (it is dereferenced later by
/// startSearch()), so the caller must keep the vector alive while the search
/// may still run.
void WordFinder::prefixMatch( QString const & str,
                              std::vector< sptr< Dictionary::Class > > const & dicts,
                              unsigned long maxResults,
                              Dictionary::Features features )
{
  // Abort whatever is running right now.
  cancel();

  // Throw away everything accumulated by the previous search.
  resultsArray.clear();
  resultsIndex.clear();
  searchResults.clear();

  // Remember the parameters of the new search.
  inputWord           = str;
  inputDicts          = &dicts;
  requestedMaxResults = maxResults;
  requestedFeatures   = features;
  searchType          = PrefixMatch;
  searchQueued        = true;

  if ( !queuedRequests.empty() ) {
    // Some of the just-cancelled requests have not finished yet. The last one
    // to finish triggers the queued search from requestFinished(). Since they
    // were all cancelled this should not take long, but it is not immediate.
    return;
  }

  // Nothing is pending, so there is nothing to wait for.
  startSearch();
}
/// Queues a stemmed-match search for @p str over @p dicts, replacing any
/// search that is currently running or queued.
///
/// @p minLength and @p maxSuffixVariation are stored and later forwarded
/// unchanged to each dictionary's stemmedMatch() by startSearch().
/// Only the address of @p dicts is stored, so the caller must keep the vector
/// alive while the search may still run.
void WordFinder::stemmedMatch( QString const & str,
                               std::vector< sptr< Dictionary::Class > > const & dicts,
                               unsigned minLength,
                               unsigned maxSuffixVariation,
                               unsigned long maxResults,
                               Dictionary::Features features )
{
  // Abort whatever is running right now.
  cancel();

  // Throw away everything accumulated by the previous search.
  resultsArray.clear();
  resultsIndex.clear();
  searchResults.clear();

  // Remember the parameters of the new search.
  inputWord                 = str;
  inputDicts                = &dicts;
  requestedMaxResults       = maxResults;
  requestedFeatures         = features;
  stemmedMinLength          = minLength;
  stemmedMaxSuffixVariation = maxSuffixVariation;
  searchType                = StemmedMatch;
  searchQueued              = true;

  if ( !queuedRequests.empty() ) {
    // Cancelled requests are still draining; requestFinished() starts the
    // queued search once the last of them is gone.
    return;
  }

  startSearch();
}
2009-01-29 19:16:25 +00:00
/// Queues an expression-match search for @p str over @p dicts, replacing any
/// search that is currently running or queued.
///
/// Only the address of @p dicts is stored, so the caller must keep the vector
/// alive while the search may still run.
void WordFinder::expressionMatch( QString const & str,
                                  std::vector< sptr< Dictionary::Class > > const & dicts,
                                  unsigned long maxResults,
                                  Dictionary::Features features )
{
  // Abort whatever is running right now.
  cancel();

  // Throw away everything accumulated by the previous search.
  resultsArray.clear();
  resultsIndex.clear();
  searchResults.clear();

  // Remember the parameters of the new search.
  inputWord           = str;
  inputDicts          = &dicts;
  requestedMaxResults = maxResults;
  requestedFeatures   = features;
  searchType          = ExpressionMatch;
  searchQueued        = true;

  if ( !queuedRequests.empty() ) {
    // Cancelled requests are still draining; requestFinished() starts the
    // queued search once the last of them is gone.
    return;
  }

  // No requests are queued, no need to wait for them to finish.
  startSearch();
}
void WordFinder::startSearch()
2009-01-29 19:16:25 +00:00
{
if ( !searchQueued ) {
return; // Search was probably cancelled
}
// Clear the requests just in case
queuedRequests.clear();
finishedRequests.clear();
searchErrorString.clear();
searchResultsUncertain = false;
searchQueued = false;
searchInProgress = true;
// Gather all writings of the word
if ( allWordWritings.size() != 1 ) {
allWordWritings.resize( 1 );
}
allWordWritings[ 0 ] = inputWord.toStdU32String();
for ( const auto & inputDict : *inputDicts ) {
vector< std::u32string > writings = inputDict->getAlternateWritings( allWordWritings[ 0 ] );
allWordWritings.insert( allWordWritings.end(), writings.begin(), writings.end() );
}
// Query each dictionary for all word writings
for ( const auto & inputDict : *inputDicts ) {
if ( ( inputDict->getFeatures() & requestedFeatures ) != requestedFeatures ) {
continue;
}
for ( const auto & allWordWriting : allWordWritings ) {
try {
sptr< Dictionary::WordSearchRequest > sr = ( searchType == PrefixMatch || searchType == ExpressionMatch ) ?
inputDict->prefixMatch( allWordWriting, requestedMaxResults ) :
inputDict->stemmedMatch( allWordWriting, stemmedMinLength, stemmedMaxSuffixVariation, requestedMaxResults );
2013-09-19 19:43:16 +00:00
connect( sr.get(), &Dictionary::Request::finished, this, &WordFinder::requestFinished, Qt::QueuedConnection );
2013-09-19 19:43:16 +00:00
queuedRequests.push_back( sr );
}
catch ( std::exception & e ) {
qWarning( "Word \"%s\" search error (%s) in \"%s\"",
inputWord.toUtf8().data(),
e.what(),
inputDict->getName().c_str() );
2013-09-19 19:43:16 +00:00
}
}
}
2009-01-29 19:16:25 +00:00
// Handle any requests finished already
2009-01-29 19:16:25 +00:00
requestFinished();
}
2009-01-29 19:16:25 +00:00
/// Stops the current search: drops both the "queued" and "in progress" flags
/// and asks every request still in queuedRequests to cancel. The requests
/// themselves stay in the queue.
void WordFinder::cancel()
{
  searchInProgress = false;
  searchQueued     = false;

  cancelSearches();
}
void WordFinder::clear()
2009-01-29 19:16:25 +00:00
{
cancel();
queuedRequests.clear();
finishedRequests.clear();
}
2009-01-29 19:16:25 +00:00
void WordFinder::requestFinished()
{
bool newResults = false;
2009-01-29 19:16:25 +00:00
// See how many new requests have finished, and if we have any new results
for ( auto i = queuedRequests.begin(); i != queuedRequests.end(); ) {
if ( ( *i )->isFinished() ) {
if ( searchInProgress && !( *i )->getErrorString().isEmpty() ) {
searchErrorString = tr( "Failed to query some dictionaries." );
}
2009-01-29 19:16:25 +00:00
if ( ( *i )->isUncertain() ) {
searchResultsUncertain = true;
}
if ( ( *i )->matchesCount() ) {
newResults = true;
2009-01-29 19:16:25 +00:00
// This list is handled by updateResults()
finishedRequests.splice( finishedRequests.end(), queuedRequests, i++ );
}
else { // We won't do anything with it anymore, so we erase it
queuedRequests.erase( i++ );
}
}
else {
++i;
}
}
2009-01-29 19:16:25 +00:00
if ( !searchInProgress ) {
// There is no search in progress, so we just wait until there's
// no requests left
if ( queuedRequests.empty() ) {
// We got rid of all queries, queued search can now start
finishedRequests.clear();
if ( searchQueued ) {
startSearch();
}
}
2009-01-29 19:16:25 +00:00
return;
}
2009-01-29 19:16:25 +00:00
if ( newResults && !queuedRequests.empty() && !updateResultsTimer.isActive() ) {
// If we have got some new results, but not all of them, we would start a
// timer to update a user some time in the future
updateResultsTimer.start();
}
2009-01-29 19:16:25 +00:00
if ( queuedRequests.empty() ) {
// Search is finished.
updateResults();
}
}
namespace {

/// Clamps @p x to the range [0, 255].
///
/// The argument is taken as a size_type so that the clamp happens before any
/// narrowing: sizes and offsets of 2^32 or more saturate at 255 instead of
/// wrapping around when converted to unsigned.
unsigned saturated( std::u32string::size_type x )
{
  return x < 255 ? static_cast< unsigned >( x ) : 255u;
}

/// Checks whether the first string has the second one inside, surrounded from
/// both sides by either whitespace, punctuation or begin/end of string.
/// If true is returned, pos holds the offset in the haystack. If the offset
/// is larger than 255, it is set to 255.
/// If false is returned, pos must not be relied upon.
bool hasSurroundedWithWs( std::u32string const & haystack,
                          std::u32string const & needle,
                          std::u32string::size_type & pos )
{
  if ( haystack.size() < needle.size() ) {
    return false; // Needle won't even fit into a haystack
  }

  // A character counts as a word boundary if it is whitespace or punctuation.
  const auto isBoundary = []( char32_t ch ) {
    return Folding::isWhitespace( ch ) || Folding::isPunct( ch );
  };

  // Try every occurrence of the needle, left to right, until one is found
  // that is delimited on both sides.
  for ( pos = haystack.find( needle ); pos != std::u32string::npos; pos = haystack.find( needle, pos + 1 ) ) {
    const std::u32string::size_type end = pos + needle.size();

    const bool boundaryBefore = ( pos == 0 ) || isBoundary( haystack[ pos - 1 ] );
    const bool boundaryAfter  = ( end == haystack.size() ) || isBoundary( haystack[ end ] );

    if ( boundaryBefore && boundaryAfter ) {
      pos = saturated( pos );
      return true;
    }
  }

  return false; // Not found
}

} // namespace
// Merges every request waiting in finishedRequests into the accumulated
// result set (resultsArray, indexed by resultsIndex), ranks that set according
// to searchType, rebuilds searchResults from it and finally emits updated()
// when requests are still pending or finished() when none are left.
// Invoked by updateResultsTimer's timeout (connected in the constructor) and
// directly from requestFinished() once the request queue has drained.
void WordFinder::updateResults()
{
if ( !searchInProgress ) {
return; // Old queued signal
}
2009-01-29 19:16:25 +00:00
if ( updateResultsTimer.isActive() ) {
updateResultsTimer.stop(); // Can happen when we were done before it'd expire
}
2009-01-29 19:16:25 +00:00
// Case-folded form of the word as typed (slot 0 of allWordWritings); used by
// the expression-match filter below.
std::u32string original = Folding::applySimpleCaseOnly( allWordWritings[ 0 ] );
// Consume the finished requests one by one; each is erased once all of its
// matches have been merged.
for ( auto i = finishedRequests.begin(); i != finishedRequests.end(); ) {
for ( size_t count = ( *i )->matchesCount(), x = 0; x < count; ++x ) {
std::u32string match = ( **i )[ x ].word;
int weight = ( **i )[ x ].weight;
// The case-folded spelling is the key under which results are de-duplicated.
std::u32string lowerCased = Folding::applySimpleCaseOnly( match );
2009-01-29 19:16:25 +00:00
if ( searchType == ExpressionMatch ) {
// Keep the match only if it starts with the original expression or is
// exactly equal (after case folding) to one of the alternate writings.
// ws ends up as the index of the writing that matched, or as
// allWordWritings.size() when none did.
unsigned ws;
for ( ws = 0; ws < allWordWritings.size(); ws++ ) {
if ( ws == 0 ) {
// Check for prefix match with original expression
if ( lowerCased.compare( 0, original.size(), original ) == 0 ) {
break;
}
}
else if ( lowerCased == Folding::applySimpleCaseOnly( allWordWritings[ ws ] ) ) {
break;
}
}
if ( ws >= allWordWritings.size() ) {
// No exact matches found
continue;
}
// The weight reported by the dictionary is replaced by the index of the
// matched writing: 0 for the original expression, non-zero for an
// alternate writing (which marks the result as "suggested" below).
weight = ws;
}
// Try to register the word; the iterator is a placeholder that is replaced
// below if the insertion actually happened.
auto insertResult =
resultsIndex.insert( pair< std::u32string, ResultsArray::iterator >( lowerCased, resultsArray.end() ) );
if ( !insertResult.second ) {
// Wasn't inserted since there was already an item -- check the case
if ( insertResult.first->second->word != match ) {
// The case is different -- agree on a lowercase version
insertResult.first->second->word = lowerCased;
2009-01-29 19:16:25 +00:00
}
// A zero-weight occurrence of the word clears the "suggested" flag of the
// already existing entry.
if ( !weight && insertResult.first->second->wasSuggested ) {
insertResult.first->second->wasSuggested = false;
}
2009-01-29 19:16:25 +00:00
}
else {
// First time this word is seen: append it with the worst possible rank
// (INT_MAX), so that any rank computed below replaces it.
resultsArray.emplace_back();
resultsArray.back().word = match;
resultsArray.back().rank = INT_MAX;
resultsArray.back().wasSuggested = ( weight != 0 );
insertResult.first->second = --resultsArray.end();
}
2009-01-29 19:16:25 +00:00
}
finishedRequests.erase( i++ );
}
2009-01-29 19:16:25 +00:00
// Upper bound on the number of entries copied into searchResults; lowered to
// 15 for stemmed matches below.
size_t maxSearchResults = 500;
// Only prefix and stemmed searches are ranked and sorted here; for expression
// searches resultsArray is left in the order built above.
if ( !resultsArray.empty() ) {
if ( searchType == PrefixMatch ) {
/// Assign each result a category, storing it in the rank's field
// The enumerators are listed from best to worst. Note that the local
// enumerator PrefixMatch hides the search-type value of the same name for
// the rest of this block.
enum Category {
ExactMatch,
ExactNoFullCaseMatch,
ExactNoDiaMatch,
ExactNoPunctMatch,
ExactNoWsMatch,
ExactInsideMatch,
ExactNoDiaInsideMatch,
ExactNoPunctInsideMatch,
PrefixMatch,
PrefixNoDiaMatch,
PrefixNoPunctMatch,
PrefixNoWsMatch,
WorstMatch,
Multiplier = 256 // Categories should be multiplied by Multiplier
};
// A rank is Category * Multiplier plus a secondary value of at most 255
// (a match offset or a result length, both saturated).
// NOTE(review): SortByRank is defined elsewhere; presumably it orders by
// ascending rank so that the category dominates -- confirm.
for ( const auto & allWordWriting : allWordWritings ) {
// Progressively more aggressive foldings of the searched writing; each one
// is derived from the previous one.
std::u32string target = Folding::applySimpleCaseOnly( allWordWriting );
std::u32string targetNoFullCase = Folding::applyFullCaseOnly( target );
std::u32string targetNoDia = Folding::applyDiacriticsOnly( targetNoFullCase );
std::u32string targetNoPunct = Folding::applyPunctOnly( targetNoDia );
std::u32string targetNoWs = Folding::applyWhitespaceOnly( targetNoPunct );
// Receives the (saturated) offset of an "inside" match from
// hasSurroundedWithWs().
std::u32string::size_type matchPos = 0;
for ( const auto & i : resultsIndex ) {
// Folded forms of the result. They are assigned inside the conditions of
// the else-if chain below, each right before its first comparison. The
// chain is evaluated top to bottom, so by the time a later branch reads
// one of these strings every earlier condition has already filled it in.
// Do not reorder the branches.
std::u32string resultNoFullCase, resultNoDia, resultNoPunct, resultNoWs;
int rank;
// Whole-word equality, under increasingly lenient foldings.
if ( i.first == target ) {
rank = ExactMatch * Multiplier;
}
else if ( ( resultNoFullCase = Folding::applyFullCaseOnly( i.first ) ) == targetNoFullCase ) {
rank = ExactNoFullCaseMatch * Multiplier;
}
else if ( ( resultNoDia = Folding::applyDiacriticsOnly( resultNoFullCase ) ) == targetNoDia ) {
rank = ExactNoDiaMatch * Multiplier;
}
else if ( ( resultNoPunct = Folding::applyPunctOnly( resultNoDia ) ) == targetNoPunct ) {
rank = ExactNoPunctMatch * Multiplier;
}
else if ( ( resultNoWs = Folding::applyWhitespaceOnly( resultNoPunct ) ) == targetNoWs ) {
rank = ExactNoWsMatch * Multiplier;
}
// The target occurs inside the result as a separate word; earlier
// occurrences (smaller offsets) get a better rank within the category.
else if ( hasSurroundedWithWs( i.first, target, matchPos ) ) {
rank = ExactInsideMatch * Multiplier + matchPos;
}
else if ( hasSurroundedWithWs( resultNoDia, targetNoDia, matchPos ) ) {
rank = ExactNoDiaInsideMatch * Multiplier + matchPos;
}
else if ( hasSurroundedWithWs( resultNoPunct, targetNoPunct, matchPos ) ) {
rank = ExactNoPunctInsideMatch * Multiplier + matchPos;
}
// The result merely starts with the target; shorter results get a better
// rank within the category (the length is saturated at 255).
else if ( i.first.size() > target.size() && i.first.compare( 0, target.size(), target ) == 0 ) {
rank = PrefixMatch * Multiplier + saturated( i.first.size() );
}
else if ( resultNoDia.size() > targetNoDia.size()
&& resultNoDia.compare( 0, targetNoDia.size(), targetNoDia ) == 0 ) {
rank = PrefixNoDiaMatch * Multiplier + saturated( i.first.size() );
}
else if ( resultNoPunct.size() > targetNoPunct.size()
&& resultNoPunct.compare( 0, targetNoPunct.size(), targetNoPunct ) == 0 ) {
rank = PrefixNoPunctMatch * Multiplier + saturated( i.first.size() );
}
else if ( resultNoWs.size() > targetNoWs.size()
&& resultNoWs.compare( 0, targetNoWs.size(), targetNoWs ) == 0 ) {
rank = PrefixNoWsMatch * Multiplier + saturated( i.first.size() );
}
else {
rank = WorstMatch * Multiplier;
}
if ( i.second->rank > rank ) {
i.second->rank = rank; // We store the best rank of any writing
}
}
}
resultsArray.sort( SortByRank() );
}
else if ( searchType == StemmedMatch ) {
// Handling stemmed matches
// We use two factors -- first is the number of characters strings share
// in their beginnings, and second, the length of the strings. Here we assign
// only the first one, storing it in rank. Then we sort the results using
// SortByRankAndLength.
for ( const auto & allWordWriting : allWordWritings ) {
std::u32string target = Folding::apply( allWordWriting );
for ( const auto & i : resultsIndex ) {
std::u32string resultFolded = Folding::apply( i.first );
// Count the leading characters shared by both folded strings. The loop
// stops at the end of the target or at the first difference (which
// includes reaching the terminator of the result).
int charsInCommon = 0;
for ( char32_t const *t = target.c_str(), *r = resultFolded.c_str(); *t && *t == *r;
++t, ++r, ++charsInCommon ) {
;
}
int rank = -charsInCommon; // Negated so the lesser-than
// comparison would yield right
// results.
if ( i.second->rank > rank ) {
i.second->rank = rank; // We store the best rank of any writing
}
}
}
resultsArray.sort( SortByRankAndLength() );
maxSearchResults = 15;
}
}
2009-01-29 19:16:25 +00:00
// Rebuild the public result list from scratch, keeping at most
// maxSearchResults entries from the front of resultsArray.
searchResults.clear();
searchResults.reserve( resultsArray.size() < maxSearchResults ? resultsArray.size() : maxSearchResults );
2009-01-29 19:16:25 +00:00
for ( const auto & i : resultsArray ) {
if ( searchResults.size() < maxSearchResults ) {
searchResults.emplace_back( QString::fromStdU32String( i.word ), i.wasSuggested );
}
else {
break;
}
}
2009-01-29 19:16:25 +00:00
if ( !queuedRequests.empty() ) {
// There are still some unhandled results.
emit updated();
2009-01-29 19:16:25 +00:00
}
else {
// That were all of them.
searchInProgress = false;
emit finished();
}
}
2009-01-29 19:16:25 +00:00
void WordFinder::cancelSearches()
{
for ( auto & queuedRequest : queuedRequests ) {
queuedRequest->cancel();
}
2009-01-29 19:16:25 +00:00
}