From 9bbb23e5d4d1a1a2a83bb7c9f7369efe07d44a3e Mon Sep 17 00:00:00 2001 From: YiFang Xiao Date: Mon, 3 Jul 2023 21:16:31 +0800 Subject: [PATCH 1/2] fix: searching for a hyphen alone returns too many unnecessary results. fix #928 --- src/btreeidx.cc | 8 +++++--- src/common/folding.cc | 5 +++++ src/common/folding.hh | 1 + src/dict/stardict.cc | 28 +++++++++++++++++++--------- src/ftshelpers.cc | 2 +- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/src/btreeidx.cc b/src/btreeidx.cc index b75914bc..c4e2cfda 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -72,7 +72,7 @@ vector< WordArticleLink > BtreeIndex::findArticles( wstring const & search_word, try { wstring folded = Folding::apply( word ); - if( folded.empty() ) + if ( folded.empty() ) folded = Folding::applyWhitespaceOnly( word ); bool exactMatch; @@ -1430,8 +1430,10 @@ void BtreeIndex::getHeadwordsFromOffsets( QList & offsets, auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ); - headwords.append( word ); - offsets.erase( it); + if ( headwords.indexOf( word ) == -1 ) { + headwords.append( word ); + } + offsets.erase( it ); begOffsets = offsets.begin(); endOffsets = offsets.end(); } diff --git a/src/common/folding.cc b/src/common/folding.cc index 90bed687..fe1efa2a 100644 --- a/src/common/folding.cc +++ b/src/common/folding.cc @@ -166,6 +166,11 @@ bool isWhitespace( wchar ch ) return QChar::isSpace( ch ); } +bool isWhitespaceOrPunct( wchar ch ) +{ + return QChar::isSpace( ch ) || QChar::isPunct( ch ); +} + bool isPunct( wchar ch ) { return QChar::isPunct( ch ); diff --git a/src/common/folding.hh b/src/common/folding.hh index f89b01ec..50b24fdf 100644 --- a/src/common/folding.hh +++ b/src/common/folding.hh @@ -57,6 +57,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & ); /// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also /// includes \n, \r and \t. 
bool isWhitespace( wchar ch ); +bool isWhitespaceOrPunct( wchar ch ); /// Returns true if the given character is any form of punctuation, false /// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes. diff --git a/src/dict/stardict.cc b/src/dict/stardict.cc index 3440d88f..a1ed54bb 100644 --- a/src/dict/stardict.cc +++ b/src/dict/stardict.cc @@ -1358,18 +1358,23 @@ void StardictArticleRequest::run() { vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics ); - for( unsigned x = 0; x < alts.size(); ++x ) - { - /// Make an additional query for each alt + // If there are 100 or more alts, the dictionary was most likely produced or parsed incorrectly. + if ( alts.size() < 100 ) { + for ( unsigned x = 0; x < alts.size(); ++x ) { + /// Make an additional query for each alt - vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); - - chain.insert( chain.end(), altChain.begin(), altChain.end() ); + vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics ); + if ( altChain.size() > 100 ) { + continue; + } + chain.insert( chain.end(), altChain.begin(), altChain.end() ); + } } + multimap< wstring, pair< string, string > > mainArticles, alternateArticles; - set< uint32_t > articlesIncluded; // Some synonims make it that the articles + set< uint32_t > articlesIncluded; // Some synonyms make it that the articles // appear several times. We combat this // by only allowing them to appear once. @@ -1377,8 +1382,8 @@ void StardictArticleRequest::run() if( ignoreDiacritics ) wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded ); - for( unsigned x = 0; x < chain.size(); ++x ) - { + // If the chain is too large, the dictionary most likely has a creation or parsing issue. 
+ for ( unsigned x = 0; x < qMin( 10, (int)chain.size() ); ++x ) { if ( Utils::AtomicInt::loadAcquire( isCancelled ) ) { finish(); @@ -1883,6 +1888,11 @@ static void handleIdxSynFile( string const & fileName, if ( strchr( word, '/' ) ) continue; // Skip this entry } + + // Skip entries that consist of a single hyphen + if ( wordLen == 1 && *word == '-' ) { + continue; // Skip this entry + } } // Insert new entry into an index diff --git a/src/ftshelpers.cc b/src/ftshelpers.cc index d8402068..42de4b3b 100644 --- a/src/ftshelpers.cc +++ b/src/ftshelpers.cc @@ -311,7 +311,7 @@ void FTSResultsRequest::run() emit matchCount(matches.get_matches_estimated()); // Display the results. qDebug() << matches.get_matches_estimated() << " results found.\n"; - qDebug() << "Matches 1-" << matches.size() << ":\n\n"; + qDebug() << "Matches " << matches.size() << ":\n\n"; QList< uint32_t > offsetsForHeadwords; for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i ) { From 5290d0ceb91cabd653c732d79713dba2e5dd4217 Mon Sep 17 00:00:00 2001 From: YiFang Xiao Date: Wed, 5 Jul 2023 22:26:27 +0800 Subject: [PATCH 2/2] fix: code smells MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🎨 apply clang-format changes --- src/btreeidx.cc | 67 +++++++++++++++++++------------------------ src/common/folding.cc | 10 +++---- 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/src/btreeidx.cc b/src/btreeidx.cc index c4e2cfda..e920f76e 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -34,7 +34,8 @@ enum }; BtreeIndex::BtreeIndex(): - idxFile( 0 ), rootNodeLoaded( false ) + idxFile( nullptr ), + rootNodeLoaded( false ) { } @@ -175,10 +176,7 @@ void BtreeWordSearchRequest::findMatches() bool insideSet = false; bool escaped = false; - for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ ) - { - wchar ch = foldedWithWildcards[ x ]; - + for ( char32_t ch : foldedWithWildcards ) { if( ch == L'\\' && !escaped ) { escaped = true; @@ -216,10 
+214,7 @@ void BtreeWordSearchRequest::findMatches() folded.clear(); folded.reserve( foldedWithWildcards.size() ); escaped = false; - for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ ) - { - wchar ch = foldedWithWildcards[ x ]; - + for ( char32_t ch : foldedWithWildcards ) { if( escaped ) { if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) ) @@ -303,11 +298,10 @@ void BtreeWordSearchRequest::findMatches() QMutexLocker _( &dataMutex ); - for( unsigned x = 0; x < chain.size(); ++x ) - { + for ( auto & x : chain ) { if( useWildcards ) { - wstring word = Utf8::decode( chain[ x ].prefix + chain[ x ].word ); + wstring word = Utf8::decode( x.prefix + x.word ); wstring result = Folding::applyDiacriticsOnly( word ); if( result.size() >= (wstring::size_type)minMatchLength ) { @@ -322,9 +316,9 @@ void BtreeWordSearchRequest::findMatches() { // Skip middle matches, if requested. If suffix variation is specified, // make sure the string isn't larger than requested. 
- if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( chain[ x ].prefix ) ).empty() ) && - ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) - addMatch( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) ); + if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() ) + && ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) ) + addMatch( Utf8::decode( x.prefix + x.word ) ); } } @@ -517,7 +511,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, nextLeaf = 0; } if( !leafEntries ) - return 0; + return nullptr; return leaf + sizeof( uint32_t ); } @@ -645,7 +639,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, if ( currentNodeOffset != rootOffset ) throw exCorruptedChainData(); else - return 0; // No match + return nullptr; // No match } // Build an array containing all chain pointers @@ -745,7 +739,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, return &extLeaf.front() + sizeof( uint32_t ); } else - return 0; // This was the last leaf + return nullptr; // This was the last leaf } else return chainToCheck[ 1 ]; @@ -1117,7 +1111,7 @@ void BtreeIndex::getAllHeadwords( QSet< QString > & headwords ) if ( !idxFile ) throw exIndexWasNotOpened(); - findArticleLinks( NULL, NULL, &headwords ); + findArticleLinks( nullptr, nullptr, &headwords ); } void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks ) @@ -1127,7 +1121,7 @@ void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks QSet< uint32_t > offsets; - findArticleLinks( &articleLinks, &offsets, NULL ); + findArticleLinks( &articleLinks, &offsets, nullptr ); } void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks, @@ -1150,7 +1144,7 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks, char const * leaf = 
&rootNode.front(); char const * leafEnd = leaf + rootNode.size(); - char const * chainPtr = 0; + char const * chainPtr = nullptr; vector< char > extLeaf; @@ -1213,22 +1207,21 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks, articleLinks->reserve( n + n / 10 ); } - for( unsigned i = 0; i < result.size(); i++ ) - { + for ( auto & i : result ) { if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) ) return; if( headwords ) - headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) ); + headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) ); - if( offsets && offsets->contains( result[ i ].articleOffset ) ) + if ( offsets && offsets->contains( i.articleOffset ) ) continue; if( offsets ) - offsets->insert( result[ i ].articleOffset ); + offsets->insert( i.articleOffset ); if( articleLinks ) - articleLinks->push_back( WordArticleLink( result[ i ].prefix + result[ i ].word, result[ i ].articleOffset ) ); + articleLinks->push_back( WordArticleLink( i.prefix + i.word, i.articleOffset ) ); } if ( chainPtr >= leafEnd ) @@ -1279,9 +1272,9 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets, QMutexLocker _( idxFileMutex ); - char const * leaf = 0; - char const * leafEnd = 0; - char const * chainPtr = 0; + char const * leaf = nullptr; + char const * leafEnd = nullptr; + char const * chainPtr = nullptr; vector< char > extLeaf; @@ -1299,9 +1292,8 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets, if( headwords ) { - for( unsigned i = 0; i < result.size(); i++ ) - { - headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) ); + for ( auto & i : result ) { + headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) ); } } @@ -1368,7 +1360,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList & offsets, char const * leaf = &rootNode.front(); char const * leafEnd = leaf + rootNode.size(); - char const * chainPtr = 0; + char 
const * chainPtr = nullptr; vector< char > extLeaf; @@ -1416,9 +1408,8 @@ void BtreeIndex::getHeadwordsFromOffsets( QList & offsets, { vector< WordArticleLink > result = readChain( chainPtr ); - for( unsigned i = 0; i < result.size(); i++ ) - { - uint32_t articleOffset = result.at(i).articleOffset; + for ( auto & i : result ) { + uint32_t articleOffset = i.articleOffset; QList::Iterator it = std::lower_bound( begOffsets, endOffsets, articleOffset ); @@ -1428,7 +1419,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList & offsets, if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) ) return; - auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ); + auto word = QString::fromUtf8( ( i.prefix + i.word ).c_str() ); if ( headwords.indexOf( word ) == -1 ) { headwords.append( word ); diff --git a/src/common/folding.cc b/src/common/folding.cc index fe1efa2a..132729fe 100644 --- a/src/common/folding.cc +++ b/src/common/folding.cc @@ -9,7 +9,6 @@ namespace Folding { - /// Tests if the given char is one of the Unicode combining marks. Some are /// caught by the diacritics folding table, but they are only handled there /// when they come with their main characters, not by themselves. 
The rest @@ -154,9 +153,10 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in ) out.reserve( in.size() ); - for( size_t left = in.size(); left--; ++nextChar ) + for ( size_t left = in.size(); left--; ++nextChar ) { if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) ) out.push_back( *nextChar ); + } return out; } @@ -246,7 +246,7 @@ QString trimWhitespace( QString const & in ) QString escapeWildcardSymbols( const QString & str ) { QString escaped( str ); - escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), "\\\\1" ); + escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), R"(\\1)" ); return escaped; } @@ -254,8 +254,8 @@ QString escapeWildcardSymbols( const QString & str ) QString unescapeWildcardSymbols( const QString & str ) { QString unescaped( str ); - unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), "\\1" ); + unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), R"(\1)" ); return unescaped; } -} +} // namespace Folding