Merge pull request #929 from xiaoyifang/fix/query-hypen

fix: searching for a hyphen `-` alone returned too many unnecessary results.
2024-11-24 04:24:09 +00:00 · 2023-07-06 17:07:05 +08:00 · 2023-07-06 17:07:05 +08:00 · d9b81e7a68
parent ff0a3a2e0e 5290d0ceb9
commit d9b81e7a68
5 changed files with 65 additions and 56 deletions
--- a/src/btreeidx.cc
+++ b/src/btreeidx.cc
@ -34,7 +34,8 @@ enum
 };
 BtreeIndex::BtreeIndex():
-  idxFile( 0 ), rootNodeLoaded( false )
+  idxFile( nullptr ),
  rootNodeLoaded( false )
 {
 }
@ -72,7 +73,7 @@ vector< WordArticleLink > BtreeIndex::findArticles( wstring const & search_word,
  try
  {
    wstring folded = Folding::apply( word );
-    if( folded.empty() )
+    if ( folded.empty() )
      folded = Folding::applyWhitespaceOnly( word );
    bool exactMatch;
@ -175,10 +176,7 @@ void BtreeWordSearchRequest::findMatches()
    bool insideSet = false;
    bool escaped = false;
-    for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
+    for ( char32_t ch : foldedWithWildcards ) {
    {
      wchar ch = foldedWithWildcards[ x ];
      if( ch == L'\\' && !escaped )
      {
        escaped = true;
@ -216,10 +214,7 @@ void BtreeWordSearchRequest::findMatches()
    folded.clear();
    folded.reserve( foldedWithWildcards.size() );
    escaped = false;
-    for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
+    for ( char32_t ch : foldedWithWildcards ) {
    {
      wchar ch = foldedWithWildcards[ x ];
      if( escaped )
      {
        if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) )
@ -303,11 +298,10 @@ void BtreeWordSearchRequest::findMatches()
          QMutexLocker _( &dataMutex );
-          for( unsigned x = 0; x < chain.size(); ++x )
+          for ( auto & x : chain ) {
          {
            if( useWildcards )
            {
-              wstring word = Utf8::decode( chain[ x ].prefix + chain[ x ].word );
+              wstring word   = Utf8::decode( x.prefix + x.word );
              wstring result = Folding::applyDiacriticsOnly( word );
              if( result.size() >= (wstring::size_type)minMatchLength )
              {
@ -322,9 +316,9 @@ void BtreeWordSearchRequest::findMatches()
            {
              // Skip middle matches, if requested. If suffix variation is specified,
              // make sure the string isn't larger than requested.
-              if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( chain[ x ].prefix ) ).empty() ) &&
+              if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() )
-                   ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
+                   && ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
-                  addMatch( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) );
+                addMatch( Utf8::decode( x.prefix + x.word ) );
            }
          }
@ -517,7 +511,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
          nextLeaf = 0;
        }
        if( !leafEntries )
-          return 0;
+          return nullptr;
        return leaf + sizeof( uint32_t );
      }
@ -645,7 +639,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
        if ( currentNodeOffset != rootOffset )
          throw exCorruptedChainData();
        else
-          return 0; // No match
+          return nullptr; // No match
      }
      // Build an array containing all chain pointers
@ -745,7 +739,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
                return &extLeaf.front() + sizeof( uint32_t );
              }
              else
-                return 0; // This was the last leaf
+                return nullptr; // This was the last leaf
            }
            else
              return chainToCheck[ 1 ];
@ -1117,7 +1111,7 @@ void BtreeIndex::getAllHeadwords( QSet< QString > & headwords )
  if ( !idxFile )
    throw exIndexWasNotOpened();
-  findArticleLinks( NULL, NULL, &headwords );
+  findArticleLinks( nullptr, nullptr, &headwords );
 }
 void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks )
@ -1127,7 +1121,7 @@ void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks
  QSet< uint32_t > offsets;
-  findArticleLinks( &articleLinks, &offsets, NULL );
+  findArticleLinks( &articleLinks, &offsets, nullptr );
 }
 void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
@ -1150,7 +1144,7 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
  char const * leaf = &rootNode.front();
  char const * leafEnd = leaf + rootNode.size();
-  char const * chainPtr = 0;
+  char const * chainPtr = nullptr;
  vector< char > extLeaf;
@ -1213,22 +1207,21 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
      articleLinks->reserve( n + n / 10 );
    }
-    for( unsigned i = 0; i < result.size(); i++ )
+    for ( auto & i : result ) {
    {
      if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
        return;
      if( headwords )
-        headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
+        headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
-      if( offsets && offsets->contains( result[ i ].articleOffset ) )
+      if ( offsets && offsets->contains( i.articleOffset ) )
        continue;
      if( offsets )
-        offsets->insert( result[ i ].articleOffset );
+        offsets->insert( i.articleOffset );
      if( articleLinks )
-        articleLinks->push_back( WordArticleLink( result[ i ].prefix + result[ i ].word, result[ i ].articleOffset ) );
+        articleLinks->push_back( WordArticleLink( i.prefix + i.word, i.articleOffset ) );
    }
    if ( chainPtr >= leafEnd )
@ -1279,9 +1272,9 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
  QMutexLocker _( idxFileMutex );
-  char const * leaf = 0;
+  char const * leaf     = nullptr;
-  char const * leafEnd = 0;
+  char const * leafEnd  = nullptr;
-  char const * chainPtr = 0;
+  char const * chainPtr = nullptr;
  vector< char > extLeaf;
@ -1299,9 +1292,8 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
    if( headwords )
    {
-      for( unsigned i = 0; i < result.size(); i++ )
+      for ( auto & i : result ) {
-      {
+        headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
        headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
      }
    }
@ -1368,7 +1360,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
  char const * leaf = &rootNode.front();
  char const * leafEnd = leaf + rootNode.size();
-  char const * chainPtr = 0;
+  char const * chainPtr = nullptr;
  vector< char > extLeaf;
@ -1416,9 +1408,8 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
  {
    vector< WordArticleLink > result = readChain( chainPtr );
-    for( unsigned i = 0; i < result.size(); i++ )
+    for ( auto & i : result ) {
-    {
+      uint32_t articleOffset = i.articleOffset;
      uint32_t articleOffset =   result.at(i).articleOffset;
      QList<uint32_t>::Iterator  it = std::lower_bound( begOffsets, endOffsets,
                                                    articleOffset );
@ -1428,10 +1419,12 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
        if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
          return;
-        auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() );
+        auto word = QString::fromUtf8( ( i.prefix + i.word ).c_str() );
-        headwords.append(  word );
+        if ( headwords.indexOf( word ) == -1 ) {
-        offsets.erase( it);
+          headwords.append( word );
        }
        offsets.erase( it );
        begOffsets = offsets.begin();
        endOffsets = offsets.end();
      }
--- a/src/common/folding.cc
+++ b/src/common/folding.cc
@ -9,7 +9,6 @@
 namespace Folding {
 /// Tests if the given char is one of the Unicode combining marks. Some are
 /// caught by the diacritics folding table, but they are only handled there
 /// when they come with their main characters, not by themselves. The rest
@ -154,9 +153,10 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
  out.reserve( in.size() );
-  for( size_t left = in.size(); left--; ++nextChar )
+  for ( size_t left = in.size(); left--; ++nextChar ) {
    if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
      out.push_back( *nextChar );
  }
  return out;
 }
@ -166,6 +166,11 @@ bool isWhitespace( wchar ch )
  return QChar::isSpace( ch );
 }
 /// Returns true if the given character is classified as either whitespace
 /// (QChar::isSpace) or punctuation (QChar::isPunct), false otherwise.
 bool isWhitespaceOrPunct( wchar ch )
 {
  return QChar::isSpace( ch ) || QChar::isPunct( ch );
 }
 bool isPunct( wchar ch )
 {
  return QChar::isPunct( ch );
@ -241,7 +246,7 @@ QString trimWhitespace( QString const & in )
 QString escapeWildcardSymbols( const QString & str )
 {
  QString escaped( str );
-  escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), "\\\\1" );
+  escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), R"(\\1)" );
  return escaped;
 }
@ -249,8 +254,8 @@ QString escapeWildcardSymbols( const QString & str )
 QString unescapeWildcardSymbols( const QString & str )
 {
  QString unescaped( str );
-  unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), "\\1" );
+  unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), R"(\1)" );
  return unescaped;
 }
-}
+} // namespace Folding
--- a/src/common/folding.hh
+++ b/src/common/folding.hh
@ -57,6 +57,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & );
 /// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
 /// includes \n, \r and \t.
 bool isWhitespace( wchar ch );
 bool isWhitespaceOrPunct( wchar ch );
 /// Returns true if the given character is any form of punctuation, false
 /// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.
--- a/src/dict/stardict.cc
+++ b/src/dict/stardict.cc
@ -1358,18 +1358,23 @@ void StardictArticleRequest::run()
  {
    vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
-    for( unsigned x = 0; x < alts.size(); ++x )
+    // If alts has 100 or more entries, the dictionary was most likely produced or parsed incorrectly.
-    {
+    if ( alts.size() < 100 ) {
-      /// Make an additional query for each alt
+      for ( unsigned x = 0; x < alts.size(); ++x ) {
        /// Make an additional query for each alt
-      vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
+        vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
-
+        if ( altChain.size() > 100 ) {
-      chain.insert( chain.end(), altChain.begin(), altChain.end() );
+          continue;
        }
        chain.insert( chain.end(), altChain.begin(), altChain.end() );
      }
    }
    multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
-    set< uint32_t > articlesIncluded; // Some synonims make it that the articles
+    set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
                                      // appear several times. We combat this
                                      // by only allowing them to appear once.
@ -1377,8 +1382,8 @@ void StardictArticleRequest::run()
    if( ignoreDiacritics )
      wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
-    for( unsigned x = 0; x < chain.size(); ++x )
+    // If the chain is too large, the dictionary most likely has a creation or parsing issue.
-    {
+    for ( unsigned x = 0; x < qMin( 10, (int)chain.size() ); ++x ) {
      if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
      {
        finish();
@ -1883,6 +1888,11 @@ static void handleIdxSynFile( string const & fileName,
        if ( strchr( word, '/' ) )
          continue; // Skip this entry
      }
      // If the entry is a lone hyphen, skip it
      if ( wordLen == 1 && *word == '-' ) {
        continue; // Skip this entry
      }
    }
    // Insert new entry into an index
--- a/src/ftshelpers.cc
+++ b/src/ftshelpers.cc
@ -311,7 +311,7 @@ void FTSResultsRequest::run()
      emit matchCount(matches.get_matches_estimated());
      // Display the results.
      qDebug() << matches.get_matches_estimated() << " results found.\n";
-      qDebug() << "Matches 1-" << matches.size() << ":\n\n";
+      qDebug() << "Matches " << matches.size() << ":\n\n";
      QList< uint32_t > offsetsForHeadwords;
      for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
      {