Merge pull request #929 from xiaoyifang/fix/query-hypen

fix: searching for a hyphen `-` alone returned too many unnecessary results.
This commit is contained in:
xiaoyifang 2023-07-06 17:07:05 +08:00 committed by GitHub
commit d9b81e7a68
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 65 additions and 56 deletions

View file

@ -34,7 +34,8 @@ enum
}; };
BtreeIndex::BtreeIndex(): BtreeIndex::BtreeIndex():
idxFile( 0 ), rootNodeLoaded( false ) idxFile( nullptr ),
rootNodeLoaded( false )
{ {
} }
@ -72,7 +73,7 @@ vector< WordArticleLink > BtreeIndex::findArticles( wstring const & search_word,
try try
{ {
wstring folded = Folding::apply( word ); wstring folded = Folding::apply( word );
if( folded.empty() ) if ( folded.empty() )
folded = Folding::applyWhitespaceOnly( word ); folded = Folding::applyWhitespaceOnly( word );
bool exactMatch; bool exactMatch;
@ -175,10 +176,7 @@ void BtreeWordSearchRequest::findMatches()
bool insideSet = false; bool insideSet = false;
bool escaped = false; bool escaped = false;
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ ) for ( char32_t ch : foldedWithWildcards ) {
{
wchar ch = foldedWithWildcards[ x ];
if( ch == L'\\' && !escaped ) if( ch == L'\\' && !escaped )
{ {
escaped = true; escaped = true;
@ -216,10 +214,7 @@ void BtreeWordSearchRequest::findMatches()
folded.clear(); folded.clear();
folded.reserve( foldedWithWildcards.size() ); folded.reserve( foldedWithWildcards.size() );
escaped = false; escaped = false;
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ ) for ( char32_t ch : foldedWithWildcards ) {
{
wchar ch = foldedWithWildcards[ x ];
if( escaped ) if( escaped )
{ {
if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) ) if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) )
@ -303,11 +298,10 @@ void BtreeWordSearchRequest::findMatches()
QMutexLocker _( &dataMutex ); QMutexLocker _( &dataMutex );
for( unsigned x = 0; x < chain.size(); ++x ) for ( auto & x : chain ) {
{
if( useWildcards ) if( useWildcards )
{ {
wstring word = Utf8::decode( chain[ x ].prefix + chain[ x ].word ); wstring word = Utf8::decode( x.prefix + x.word );
wstring result = Folding::applyDiacriticsOnly( word ); wstring result = Folding::applyDiacriticsOnly( word );
if( result.size() >= (wstring::size_type)minMatchLength ) if( result.size() >= (wstring::size_type)minMatchLength )
{ {
@ -322,9 +316,9 @@ void BtreeWordSearchRequest::findMatches()
{ {
// Skip middle matches, if requested. If suffix variation is specified, // Skip middle matches, if requested. If suffix variation is specified,
// make sure the string isn't larger than requested. // make sure the string isn't larger than requested.
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( chain[ x ].prefix ) ).empty() ) && if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() )
( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) && ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
addMatch( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ); addMatch( Utf8::decode( x.prefix + x.word ) );
} }
} }
@ -517,7 +511,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
nextLeaf = 0; nextLeaf = 0;
} }
if( !leafEntries ) if( !leafEntries )
return 0; return nullptr;
return leaf + sizeof( uint32_t ); return leaf + sizeof( uint32_t );
} }
@ -645,7 +639,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
if ( currentNodeOffset != rootOffset ) if ( currentNodeOffset != rootOffset )
throw exCorruptedChainData(); throw exCorruptedChainData();
else else
return 0; // No match return nullptr; // No match
} }
// Build an array containing all chain pointers // Build an array containing all chain pointers
@ -745,7 +739,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
return &extLeaf.front() + sizeof( uint32_t ); return &extLeaf.front() + sizeof( uint32_t );
} }
else else
return 0; // This was the last leaf return nullptr; // This was the last leaf
} }
else else
return chainToCheck[ 1 ]; return chainToCheck[ 1 ];
@ -1117,7 +1111,7 @@ void BtreeIndex::getAllHeadwords( QSet< QString > & headwords )
if ( !idxFile ) if ( !idxFile )
throw exIndexWasNotOpened(); throw exIndexWasNotOpened();
findArticleLinks( NULL, NULL, &headwords ); findArticleLinks( nullptr, nullptr, &headwords );
} }
void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks ) void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks )
@ -1127,7 +1121,7 @@ void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks
QSet< uint32_t > offsets; QSet< uint32_t > offsets;
findArticleLinks( &articleLinks, &offsets, NULL ); findArticleLinks( &articleLinks, &offsets, nullptr );
} }
void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks, void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
@ -1150,7 +1144,7 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
char const * leaf = &rootNode.front(); char const * leaf = &rootNode.front();
char const * leafEnd = leaf + rootNode.size(); char const * leafEnd = leaf + rootNode.size();
char const * chainPtr = 0; char const * chainPtr = nullptr;
vector< char > extLeaf; vector< char > extLeaf;
@ -1213,22 +1207,21 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
articleLinks->reserve( n + n / 10 ); articleLinks->reserve( n + n / 10 );
} }
for( unsigned i = 0; i < result.size(); i++ ) for ( auto & i : result ) {
{
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) ) if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
return; return;
if( headwords ) if( headwords )
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) ); headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
if( offsets && offsets->contains( result[ i ].articleOffset ) ) if ( offsets && offsets->contains( i.articleOffset ) )
continue; continue;
if( offsets ) if( offsets )
offsets->insert( result[ i ].articleOffset ); offsets->insert( i.articleOffset );
if( articleLinks ) if( articleLinks )
articleLinks->push_back( WordArticleLink( result[ i ].prefix + result[ i ].word, result[ i ].articleOffset ) ); articleLinks->push_back( WordArticleLink( i.prefix + i.word, i.articleOffset ) );
} }
if ( chainPtr >= leafEnd ) if ( chainPtr >= leafEnd )
@ -1279,9 +1272,9 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
QMutexLocker _( idxFileMutex ); QMutexLocker _( idxFileMutex );
char const * leaf = 0; char const * leaf = nullptr;
char const * leafEnd = 0; char const * leafEnd = nullptr;
char const * chainPtr = 0; char const * chainPtr = nullptr;
vector< char > extLeaf; vector< char > extLeaf;
@ -1299,9 +1292,8 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
if( headwords ) if( headwords )
{ {
for( unsigned i = 0; i < result.size(); i++ ) for ( auto & i : result ) {
{ headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
} }
} }
@ -1368,7 +1360,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
char const * leaf = &rootNode.front(); char const * leaf = &rootNode.front();
char const * leafEnd = leaf + rootNode.size(); char const * leafEnd = leaf + rootNode.size();
char const * chainPtr = 0; char const * chainPtr = nullptr;
vector< char > extLeaf; vector< char > extLeaf;
@ -1416,9 +1408,8 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
{ {
vector< WordArticleLink > result = readChain( chainPtr ); vector< WordArticleLink > result = readChain( chainPtr );
for( unsigned i = 0; i < result.size(); i++ ) for ( auto & i : result ) {
{ uint32_t articleOffset = i.articleOffset;
uint32_t articleOffset = result.at(i).articleOffset;
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets, QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
articleOffset ); articleOffset );
@ -1428,10 +1419,12 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) ) if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
return; return;
auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ); auto word = QString::fromUtf8( ( i.prefix + i.word ).c_str() );
headwords.append( word ); if ( headwords.indexOf( word ) == -1 ) {
offsets.erase( it); headwords.append( word );
}
offsets.erase( it );
begOffsets = offsets.begin(); begOffsets = offsets.begin();
endOffsets = offsets.end(); endOffsets = offsets.end();
} }

View file

@ -9,7 +9,6 @@
namespace Folding { namespace Folding {
/// Tests if the given char is one of the Unicode combining marks. Some are /// Tests if the given char is one of the Unicode combining marks. Some are
/// caught by the diacritics folding table, but they are only handled there /// caught by the diacritics folding table, but they are only handled there
/// when they come with their main characters, not by themselves. The rest /// when they come with their main characters, not by themselves. The rest
@ -154,9 +153,10 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
out.reserve( in.size() ); out.reserve( in.size() );
for( size_t left = in.size(); left--; ++nextChar ) for ( size_t left = in.size(); left--; ++nextChar ) {
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) ) if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
out.push_back( *nextChar ); out.push_back( *nextChar );
}
return out; return out;
} }
@ -166,6 +166,11 @@ bool isWhitespace( wchar ch )
return QChar::isSpace( ch ); return QChar::isSpace( ch );
} }
/// Returns true if the given character is classified by QChar as either
/// whitespace or punctuation, false otherwise.
bool isWhitespaceOrPunct( wchar ch )
{
  // Whitespace is tested first; punctuation is only consulted when that fails.
  if ( QChar::isSpace( ch ) ) {
    return true;
  }
  return QChar::isPunct( ch );
}
bool isPunct( wchar ch ) bool isPunct( wchar ch )
{ {
return QChar::isPunct( ch ); return QChar::isPunct( ch );
@ -241,7 +246,7 @@ QString trimWhitespace( QString const & in )
QString escapeWildcardSymbols( const QString & str ) QString escapeWildcardSymbols( const QString & str )
{ {
QString escaped( str ); QString escaped( str );
escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), "\\\\1" ); escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), R"(\\1)" );
return escaped; return escaped;
} }
@ -249,8 +254,8 @@ QString escapeWildcardSymbols( const QString & str )
QString unescapeWildcardSymbols( const QString & str ) QString unescapeWildcardSymbols( const QString & str )
{ {
QString unescaped( str ); QString unescaped( str );
unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), "\\1" ); unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), R"(\1)" );
return unescaped; return unescaped;
} }
} } // namespace Folding

View file

@ -57,6 +57,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & );
/// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also /// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
/// includes \n, \r and \t. /// includes \n, \r and \t.
bool isWhitespace( wchar ch ); bool isWhitespace( wchar ch );
bool isWhitespaceOrPunct( wchar ch );
/// Returns true if the given character is any form of punctuation, false /// Returns true if the given character is any form of punctuation, false
/// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes. /// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.

View file

@ -1358,18 +1358,23 @@ void StardictArticleRequest::run()
{ {
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics ); vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
for( unsigned x = 0; x < alts.size(); ++x ) //if alts has more than 100 , great probability that the dictionary is wrong produced or parsed.
{ if ( alts.size() < 100 ) {
/// Make an additional query for each alt for ( unsigned x = 0; x < alts.size(); ++x ) {
/// Make an additional query for each alt
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
if ( altChain.size() > 100 ) {
chain.insert( chain.end(), altChain.begin(), altChain.end() ); continue;
}
chain.insert( chain.end(), altChain.begin(), altChain.end() );
}
} }
multimap< wstring, pair< string, string > > mainArticles, alternateArticles; multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles set< uint32_t > articlesIncluded; // Some synonms make it that the articles
// appear several times. We combat this // appear several times. We combat this
// by only allowing them to appear once. // by only allowing them to appear once.
@ -1377,8 +1382,8 @@ void StardictArticleRequest::run()
if( ignoreDiacritics ) if( ignoreDiacritics )
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
for( unsigned x = 0; x < chain.size(); ++x ) //if the chain is too large, it is more likely has some dictionary making or parsing issue.
{ for ( unsigned x = 0; x < qMin( 10, (int)chain.size() ); ++x ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{ {
finish(); finish();
@ -1883,6 +1888,11 @@ static void handleIdxSynFile( string const & fileName,
if ( strchr( word, '/' ) ) if ( strchr( word, '/' ) )
continue; // Skip this entry continue; // Skip this entry
} }
// if the entry is a lone hyphen, skip it
if ( wordLen == 1 && *word == '-' ) {
continue; // Skip this entry
}
} }
// Insert new entry into an index // Insert new entry into an index

View file

@ -311,7 +311,7 @@ void FTSResultsRequest::run()
emit matchCount(matches.get_matches_estimated()); emit matchCount(matches.get_matches_estimated());
// Display the results. // Display the results.
qDebug() << matches.get_matches_estimated() << " results found.\n"; qDebug() << matches.get_matches_estimated() << " results found.\n";
qDebug() << "Matches 1-" << matches.size() << ":\n\n"; qDebug() << "Matches " << matches.size() << ":\n\n";
QList< uint32_t > offsetsForHeadwords; QList< uint32_t > offsetsForHeadwords;
for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i ) for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
{ {