Merge pull request #929 from xiaoyifang/fix/query-hypen

fix: searching for a hyphen `-` alone returned too many unnecessary results.
This commit is contained in:
xiaoyifang 2023-07-06 17:07:05 +08:00 committed by GitHub
commit d9b81e7a68
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 65 additions and 56 deletions

View file

@ -34,7 +34,8 @@ enum
};
BtreeIndex::BtreeIndex():
idxFile( 0 ), rootNodeLoaded( false )
idxFile( nullptr ),
rootNodeLoaded( false )
{
}
@ -72,7 +73,7 @@ vector< WordArticleLink > BtreeIndex::findArticles( wstring const & search_word,
try
{
wstring folded = Folding::apply( word );
if( folded.empty() )
if ( folded.empty() )
folded = Folding::applyWhitespaceOnly( word );
bool exactMatch;
@ -175,10 +176,7 @@ void BtreeWordSearchRequest::findMatches()
bool insideSet = false;
bool escaped = false;
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
{
wchar ch = foldedWithWildcards[ x ];
for ( char32_t ch : foldedWithWildcards ) {
if( ch == L'\\' && !escaped )
{
escaped = true;
@ -216,10 +214,7 @@ void BtreeWordSearchRequest::findMatches()
folded.clear();
folded.reserve( foldedWithWildcards.size() );
escaped = false;
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
{
wchar ch = foldedWithWildcards[ x ];
for ( char32_t ch : foldedWithWildcards ) {
if( escaped )
{
if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) )
@ -303,11 +298,10 @@ void BtreeWordSearchRequest::findMatches()
QMutexLocker _( &dataMutex );
for( unsigned x = 0; x < chain.size(); ++x )
{
for ( auto & x : chain ) {
if( useWildcards )
{
wstring word = Utf8::decode( chain[ x ].prefix + chain[ x ].word );
wstring word = Utf8::decode( x.prefix + x.word );
wstring result = Folding::applyDiacriticsOnly( word );
if( result.size() >= (wstring::size_type)minMatchLength )
{
@ -322,9 +316,9 @@ void BtreeWordSearchRequest::findMatches()
{
// Skip middle matches, if requested. If suffix variation is specified,
// make sure the string isn't larger than requested.
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( chain[ x ].prefix ) ).empty() ) &&
( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
addMatch( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) );
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() )
&& ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
addMatch( Utf8::decode( x.prefix + x.word ) );
}
}
@ -517,7 +511,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
nextLeaf = 0;
}
if( !leafEntries )
return 0;
return nullptr;
return leaf + sizeof( uint32_t );
}
@ -645,7 +639,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
if ( currentNodeOffset != rootOffset )
throw exCorruptedChainData();
else
return 0; // No match
return nullptr; // No match
}
// Build an array containing all chain pointers
@ -745,7 +739,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
return &extLeaf.front() + sizeof( uint32_t );
}
else
return 0; // This was the last leaf
return nullptr; // This was the last leaf
}
else
return chainToCheck[ 1 ];
@ -1117,7 +1111,7 @@ void BtreeIndex::getAllHeadwords( QSet< QString > & headwords )
if ( !idxFile )
throw exIndexWasNotOpened();
findArticleLinks( NULL, NULL, &headwords );
findArticleLinks( nullptr, nullptr, &headwords );
}
void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks )
@ -1127,7 +1121,7 @@ void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks
QSet< uint32_t > offsets;
findArticleLinks( &articleLinks, &offsets, NULL );
findArticleLinks( &articleLinks, &offsets, nullptr );
}
void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
@ -1150,7 +1144,7 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
char const * leaf = &rootNode.front();
char const * leafEnd = leaf + rootNode.size();
char const * chainPtr = 0;
char const * chainPtr = nullptr;
vector< char > extLeaf;
@ -1213,22 +1207,21 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
articleLinks->reserve( n + n / 10 );
}
for( unsigned i = 0; i < result.size(); i++ )
{
for ( auto & i : result ) {
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
return;
if( headwords )
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
if( offsets && offsets->contains( result[ i ].articleOffset ) )
if ( offsets && offsets->contains( i.articleOffset ) )
continue;
if( offsets )
offsets->insert( result[ i ].articleOffset );
offsets->insert( i.articleOffset );
if( articleLinks )
articleLinks->push_back( WordArticleLink( result[ i ].prefix + result[ i ].word, result[ i ].articleOffset ) );
articleLinks->push_back( WordArticleLink( i.prefix + i.word, i.articleOffset ) );
}
if ( chainPtr >= leafEnd )
@ -1279,9 +1272,9 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
QMutexLocker _( idxFileMutex );
char const * leaf = 0;
char const * leafEnd = 0;
char const * chainPtr = 0;
char const * leaf = nullptr;
char const * leafEnd = nullptr;
char const * chainPtr = nullptr;
vector< char > extLeaf;
@ -1299,9 +1292,8 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
if( headwords )
{
for( unsigned i = 0; i < result.size(); i++ )
{
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
for ( auto & i : result ) {
headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
}
}
@ -1368,7 +1360,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
char const * leaf = &rootNode.front();
char const * leafEnd = leaf + rootNode.size();
char const * chainPtr = 0;
char const * chainPtr = nullptr;
vector< char > extLeaf;
@ -1416,9 +1408,8 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
{
vector< WordArticleLink > result = readChain( chainPtr );
for( unsigned i = 0; i < result.size(); i++ )
{
uint32_t articleOffset = result.at(i).articleOffset;
for ( auto & i : result ) {
uint32_t articleOffset = i.articleOffset;
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
articleOffset );
@ -1428,10 +1419,12 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
return;
auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() );
auto word = QString::fromUtf8( ( i.prefix + i.word ).c_str() );
headwords.append( word );
offsets.erase( it);
if ( headwords.indexOf( word ) == -1 ) {
headwords.append( word );
}
offsets.erase( it );
begOffsets = offsets.begin();
endOffsets = offsets.end();
}

View file

@ -9,7 +9,6 @@
namespace Folding {
/// Tests if the given char is one of the Unicode combining marks. Some are
/// caught by the diacritics folding table, but they are only handled there
/// when they come with their main characters, not by themselves. The rest
@ -154,9 +153,10 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
out.reserve( in.size() );
for( size_t left = in.size(); left--; ++nextChar )
for ( size_t left = in.size(); left--; ++nextChar ) {
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
out.push_back( *nextChar );
}
return out;
}
@ -166,6 +166,11 @@ bool isWhitespace( wchar ch )
return QChar::isSpace( ch );
}
/// Returns true if the given character is either whitespace or punctuation,
/// i.e. if QChar::isSpace() or QChar::isPunct() reports true for it; false
/// otherwise.
bool isWhitespaceOrPunct( wchar ch )
{
return QChar::isSpace( ch ) || QChar::isPunct( ch );
}
bool isPunct( wchar ch )
{
return QChar::isPunct( ch );
@ -241,7 +246,7 @@ QString trimWhitespace( QString const & in )
QString escapeWildcardSymbols( const QString & str )
{
QString escaped( str );
escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), "\\\\1" );
escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), R"(\\1)" );
return escaped;
}
@ -249,8 +254,8 @@ QString escapeWildcardSymbols( const QString & str )
QString unescapeWildcardSymbols( const QString & str )
{
QString unescaped( str );
unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), "\\1" );
unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), R"(\1)" );
return unescaped;
}
}
} // namespace Folding

View file

@ -57,6 +57,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & );
/// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
/// includes \n, \r and \t.
bool isWhitespace( wchar ch );
bool isWhitespaceOrPunct( wchar ch );
/// Returns true if the given character is any form of punctuation, false
/// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.

View file

@ -1358,18 +1358,23 @@ void StardictArticleRequest::run()
{
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
for( unsigned x = 0; x < alts.size(); ++x )
{
/// Make an additional query for each alt
// If alts has 100 or more entries, the dictionary was most likely produced or parsed incorrectly.
if ( alts.size() < 100 ) {
for ( unsigned x = 0; x < alts.size(); ++x ) {
/// Make an additional query for each alt
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
chain.insert( chain.end(), altChain.begin(), altChain.end() );
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
if ( altChain.size() > 100 ) {
continue;
}
chain.insert( chain.end(), altChain.begin(), altChain.end() );
}
}
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
set< uint32_t > articlesIncluded; // Some synonms make it that the articles
// appear several times. We combat this
// by only allowing them to appear once.
@ -1377,8 +1382,8 @@ void StardictArticleRequest::run()
if( ignoreDiacritics )
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
for( unsigned x = 0; x < chain.size(); ++x )
{
// If the chain is too large, the dictionary most likely has a creation or parsing issue, so only the first 10 entries are processed.
for ( unsigned x = 0; x < qMin( 10, (int)chain.size() ); ++x ) {
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
@ -1883,6 +1888,11 @@ static void handleIdxSynFile( string const & fileName,
if ( strchr( word, '/' ) )
continue; // Skip this entry
}
// If the entry is a lone hyphen, skip it.
if ( wordLen == 1 && *word == '-' ) {
continue; // Skip this entry
}
}
// Insert new entry into an index

View file

@ -311,7 +311,7 @@ void FTSResultsRequest::run()
emit matchCount(matches.get_matches_estimated());
// Display the results.
qDebug() << matches.get_matches_estimated() << " results found.\n";
qDebug() << "Matches 1-" << matches.size() << ":\n\n";
qDebug() << "Matches " << matches.size() << ":\n\n";
QList< uint32_t > offsetsForHeadwords;
for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
{