mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 04:24:09 +00:00
Merge pull request #929 from xiaoyifang/fix/query-hypen
fix: searching for a hyphen `-` alone returns too many unnecessary results.
This commit is contained in:
commit
d9b81e7a68
|
@ -34,7 +34,8 @@ enum
|
||||||
};
|
};
|
||||||
|
|
||||||
BtreeIndex::BtreeIndex():
|
BtreeIndex::BtreeIndex():
|
||||||
idxFile( 0 ), rootNodeLoaded( false )
|
idxFile( nullptr ),
|
||||||
|
rootNodeLoaded( false )
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,7 +73,7 @@ vector< WordArticleLink > BtreeIndex::findArticles( wstring const & search_word,
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
wstring folded = Folding::apply( word );
|
wstring folded = Folding::apply( word );
|
||||||
if( folded.empty() )
|
if ( folded.empty() )
|
||||||
folded = Folding::applyWhitespaceOnly( word );
|
folded = Folding::applyWhitespaceOnly( word );
|
||||||
|
|
||||||
bool exactMatch;
|
bool exactMatch;
|
||||||
|
@ -175,10 +176,7 @@ void BtreeWordSearchRequest::findMatches()
|
||||||
|
|
||||||
bool insideSet = false;
|
bool insideSet = false;
|
||||||
bool escaped = false;
|
bool escaped = false;
|
||||||
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
|
for ( char32_t ch : foldedWithWildcards ) {
|
||||||
{
|
|
||||||
wchar ch = foldedWithWildcards[ x ];
|
|
||||||
|
|
||||||
if( ch == L'\\' && !escaped )
|
if( ch == L'\\' && !escaped )
|
||||||
{
|
{
|
||||||
escaped = true;
|
escaped = true;
|
||||||
|
@ -216,10 +214,7 @@ void BtreeWordSearchRequest::findMatches()
|
||||||
folded.clear();
|
folded.clear();
|
||||||
folded.reserve( foldedWithWildcards.size() );
|
folded.reserve( foldedWithWildcards.size() );
|
||||||
escaped = false;
|
escaped = false;
|
||||||
for( wstring::size_type x = 0; x < foldedWithWildcards.size(); x++ )
|
for ( char32_t ch : foldedWithWildcards ) {
|
||||||
{
|
|
||||||
wchar ch = foldedWithWildcards[ x ];
|
|
||||||
|
|
||||||
if( escaped )
|
if( escaped )
|
||||||
{
|
{
|
||||||
if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) )
|
if( bNoLetters || ( ch != L'*' && ch != L'?' && ch != L'[' && ch != L']' ) )
|
||||||
|
@ -303,11 +298,10 @@ void BtreeWordSearchRequest::findMatches()
|
||||||
|
|
||||||
QMutexLocker _( &dataMutex );
|
QMutexLocker _( &dataMutex );
|
||||||
|
|
||||||
for( unsigned x = 0; x < chain.size(); ++x )
|
for ( auto & x : chain ) {
|
||||||
{
|
|
||||||
if( useWildcards )
|
if( useWildcards )
|
||||||
{
|
{
|
||||||
wstring word = Utf8::decode( chain[ x ].prefix + chain[ x ].word );
|
wstring word = Utf8::decode( x.prefix + x.word );
|
||||||
wstring result = Folding::applyDiacriticsOnly( word );
|
wstring result = Folding::applyDiacriticsOnly( word );
|
||||||
if( result.size() >= (wstring::size_type)minMatchLength )
|
if( result.size() >= (wstring::size_type)minMatchLength )
|
||||||
{
|
{
|
||||||
|
@ -322,9 +316,9 @@ void BtreeWordSearchRequest::findMatches()
|
||||||
{
|
{
|
||||||
// Skip middle matches, if requested. If suffix variation is specified,
|
// Skip middle matches, if requested. If suffix variation is specified,
|
||||||
// make sure the string isn't larger than requested.
|
// make sure the string isn't larger than requested.
|
||||||
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( chain[ x ].prefix ) ).empty() ) &&
|
if ( ( allowMiddleMatches || Folding::apply( Utf8::decode( x.prefix ) ).empty() )
|
||||||
( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
|
&& ( maxSuffixVariation < 0 || (int)resultFolded.size() - initialFoldedSize <= maxSuffixVariation ) )
|
||||||
addMatch( Utf8::decode( chain[ x ].prefix + chain[ x ].word ) );
|
addMatch( Utf8::decode( x.prefix + x.word ) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -517,7 +511,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
|
||||||
nextLeaf = 0;
|
nextLeaf = 0;
|
||||||
}
|
}
|
||||||
if( !leafEntries )
|
if( !leafEntries )
|
||||||
return 0;
|
return nullptr;
|
||||||
|
|
||||||
return leaf + sizeof( uint32_t );
|
return leaf + sizeof( uint32_t );
|
||||||
}
|
}
|
||||||
|
@ -645,7 +639,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
|
||||||
if ( currentNodeOffset != rootOffset )
|
if ( currentNodeOffset != rootOffset )
|
||||||
throw exCorruptedChainData();
|
throw exCorruptedChainData();
|
||||||
else
|
else
|
||||||
return 0; // No match
|
return nullptr; // No match
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build an array containing all chain pointers
|
// Build an array containing all chain pointers
|
||||||
|
@ -745,7 +739,7 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
|
||||||
return &extLeaf.front() + sizeof( uint32_t );
|
return &extLeaf.front() + sizeof( uint32_t );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return 0; // This was the last leaf
|
return nullptr; // This was the last leaf
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return chainToCheck[ 1 ];
|
return chainToCheck[ 1 ];
|
||||||
|
@ -1117,7 +1111,7 @@ void BtreeIndex::getAllHeadwords( QSet< QString > & headwords )
|
||||||
if ( !idxFile )
|
if ( !idxFile )
|
||||||
throw exIndexWasNotOpened();
|
throw exIndexWasNotOpened();
|
||||||
|
|
||||||
findArticleLinks( NULL, NULL, &headwords );
|
findArticleLinks( nullptr, nullptr, &headwords );
|
||||||
}
|
}
|
||||||
|
|
||||||
void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks )
|
void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks )
|
||||||
|
@ -1127,7 +1121,7 @@ void BtreeIndex::findAllArticleLinks( QVector< WordArticleLink > & articleLinks
|
||||||
|
|
||||||
QSet< uint32_t > offsets;
|
QSet< uint32_t > offsets;
|
||||||
|
|
||||||
findArticleLinks( &articleLinks, &offsets, NULL );
|
findArticleLinks( &articleLinks, &offsets, nullptr );
|
||||||
}
|
}
|
||||||
|
|
||||||
void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
|
void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
|
||||||
|
@ -1150,7 +1144,7 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
|
||||||
|
|
||||||
char const * leaf = &rootNode.front();
|
char const * leaf = &rootNode.front();
|
||||||
char const * leafEnd = leaf + rootNode.size();
|
char const * leafEnd = leaf + rootNode.size();
|
||||||
char const * chainPtr = 0;
|
char const * chainPtr = nullptr;
|
||||||
|
|
||||||
vector< char > extLeaf;
|
vector< char > extLeaf;
|
||||||
|
|
||||||
|
@ -1213,22 +1207,21 @@ void BtreeIndex::findArticleLinks( QVector< WordArticleLink > * articleLinks,
|
||||||
articleLinks->reserve( n + n / 10 );
|
articleLinks->reserve( n + n / 10 );
|
||||||
}
|
}
|
||||||
|
|
||||||
for( unsigned i = 0; i < result.size(); i++ )
|
for ( auto & i : result ) {
|
||||||
{
|
|
||||||
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if( headwords )
|
if( headwords )
|
||||||
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
|
headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
|
||||||
|
|
||||||
if( offsets && offsets->contains( result[ i ].articleOffset ) )
|
if ( offsets && offsets->contains( i.articleOffset ) )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if( offsets )
|
if( offsets )
|
||||||
offsets->insert( result[ i ].articleOffset );
|
offsets->insert( i.articleOffset );
|
||||||
|
|
||||||
if( articleLinks )
|
if( articleLinks )
|
||||||
articleLinks->push_back( WordArticleLink( result[ i ].prefix + result[ i ].word, result[ i ].articleOffset ) );
|
articleLinks->push_back( WordArticleLink( i.prefix + i.word, i.articleOffset ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( chainPtr >= leafEnd )
|
if ( chainPtr >= leafEnd )
|
||||||
|
@ -1279,9 +1272,9 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
|
||||||
|
|
||||||
QMutexLocker _( idxFileMutex );
|
QMutexLocker _( idxFileMutex );
|
||||||
|
|
||||||
char const * leaf = 0;
|
char const * leaf = nullptr;
|
||||||
char const * leafEnd = 0;
|
char const * leafEnd = nullptr;
|
||||||
char const * chainPtr = 0;
|
char const * chainPtr = nullptr;
|
||||||
|
|
||||||
vector< char > extLeaf;
|
vector< char > extLeaf;
|
||||||
|
|
||||||
|
@ -1299,9 +1292,8 @@ void BtreeIndex::findSingleNodeHeadwords( uint32_t offsets,
|
||||||
|
|
||||||
if( headwords )
|
if( headwords )
|
||||||
{
|
{
|
||||||
for( unsigned i = 0; i < result.size(); i++ )
|
for ( auto & i : result ) {
|
||||||
{
|
headwords->insert( QString::fromUtf8( ( i.prefix + i.word ).c_str() ) );
|
||||||
headwords->insert( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1368,7 +1360,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
|
||||||
|
|
||||||
char const * leaf = &rootNode.front();
|
char const * leaf = &rootNode.front();
|
||||||
char const * leafEnd = leaf + rootNode.size();
|
char const * leafEnd = leaf + rootNode.size();
|
||||||
char const * chainPtr = 0;
|
char const * chainPtr = nullptr;
|
||||||
|
|
||||||
vector< char > extLeaf;
|
vector< char > extLeaf;
|
||||||
|
|
||||||
|
@ -1416,9 +1408,8 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
|
||||||
{
|
{
|
||||||
vector< WordArticleLink > result = readChain( chainPtr );
|
vector< WordArticleLink > result = readChain( chainPtr );
|
||||||
|
|
||||||
for( unsigned i = 0; i < result.size(); i++ )
|
for ( auto & i : result ) {
|
||||||
{
|
uint32_t articleOffset = i.articleOffset;
|
||||||
uint32_t articleOffset = result.at(i).articleOffset;
|
|
||||||
|
|
||||||
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
|
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
|
||||||
articleOffset );
|
articleOffset );
|
||||||
|
@ -1428,10 +1419,12 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
|
||||||
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() );
|
auto word = QString::fromUtf8( ( i.prefix + i.word ).c_str() );
|
||||||
|
|
||||||
headwords.append( word );
|
if ( headwords.indexOf( word ) == -1 ) {
|
||||||
offsets.erase( it);
|
headwords.append( word );
|
||||||
|
}
|
||||||
|
offsets.erase( it );
|
||||||
begOffsets = offsets.begin();
|
begOffsets = offsets.begin();
|
||||||
endOffsets = offsets.end();
|
endOffsets = offsets.end();
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,6 @@
|
||||||
|
|
||||||
namespace Folding {
|
namespace Folding {
|
||||||
|
|
||||||
|
|
||||||
/// Tests if the given char is one of the Unicode combining marks. Some are
|
/// Tests if the given char is one of the Unicode combining marks. Some are
|
||||||
/// caught by the diacritics folding table, but they are only handled there
|
/// caught by the diacritics folding table, but they are only handled there
|
||||||
/// when they come with their main characters, not by themselves. The rest
|
/// when they come with their main characters, not by themselves. The rest
|
||||||
|
@ -154,9 +153,10 @@ wstring applyWhitespaceAndPunctOnly( wstring const & in )
|
||||||
|
|
||||||
out.reserve( in.size() );
|
out.reserve( in.size() );
|
||||||
|
|
||||||
for( size_t left = in.size(); left--; ++nextChar )
|
for ( size_t left = in.size(); left--; ++nextChar ) {
|
||||||
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
|
if ( !isWhitespace( *nextChar ) && !isPunct( *nextChar ) )
|
||||||
out.push_back( *nextChar );
|
out.push_back( *nextChar );
|
||||||
|
}
|
||||||
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
@ -166,6 +166,11 @@ bool isWhitespace( wchar ch )
|
||||||
return QChar::isSpace( ch );
|
return QChar::isSpace( ch );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isWhitespaceOrPunct( wchar ch )
|
||||||
|
{
|
||||||
|
return QChar::isSpace( ch ) || QChar::isPunct( ch );
|
||||||
|
}
|
||||||
|
|
||||||
bool isPunct( wchar ch )
|
bool isPunct( wchar ch )
|
||||||
{
|
{
|
||||||
return QChar::isPunct( ch );
|
return QChar::isPunct( ch );
|
||||||
|
@ -241,7 +246,7 @@ QString trimWhitespace( QString const & in )
|
||||||
QString escapeWildcardSymbols( const QString & str )
|
QString escapeWildcardSymbols( const QString & str )
|
||||||
{
|
{
|
||||||
QString escaped( str );
|
QString escaped( str );
|
||||||
escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), "\\\\1" );
|
escaped.replace( QRegularExpression( R"(([\[\]\?\*]))" ), R"(\\1)" );
|
||||||
|
|
||||||
return escaped;
|
return escaped;
|
||||||
}
|
}
|
||||||
|
@ -249,8 +254,8 @@ QString escapeWildcardSymbols( const QString & str )
|
||||||
QString unescapeWildcardSymbols( const QString & str )
|
QString unescapeWildcardSymbols( const QString & str )
|
||||||
{
|
{
|
||||||
QString unescaped( str );
|
QString unescaped( str );
|
||||||
unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), "\\1" );
|
unescaped.replace( QRegularExpression( R"(\\([\[\]\?\*]))" ), R"(\1)" );
|
||||||
|
|
||||||
return unescaped;
|
return unescaped;
|
||||||
}
|
}
|
||||||
}
|
} // namespace Folding
|
||||||
|
|
|
@ -57,6 +57,7 @@ wstring applyWhitespaceAndPunctOnly( wstring const & );
|
||||||
/// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
|
/// otherwise. Whitespace corresponds to Zl/Zp/Zs Unicode classes, and also
|
||||||
/// includes \n, \r and \t.
|
/// includes \n, \r and \t.
|
||||||
bool isWhitespace( wchar ch );
|
bool isWhitespace( wchar ch );
|
||||||
|
bool isWhitespaceOrPunct( wchar ch );
|
||||||
|
|
||||||
/// Returns true if the given character is any form of punctuation, false
|
/// Returns true if the given character is any form of punctuation, false
|
||||||
/// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.
|
/// otherwise. Punctuation corresponds to Pc/Pd/Pe/Pf/Pi/Po/Ps classes.
|
||||||
|
|
|
@ -1358,18 +1358,23 @@ void StardictArticleRequest::run()
|
||||||
{
|
{
|
||||||
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
vector< WordArticleLink > chain = dict.findArticles( word, ignoreDiacritics );
|
||||||
|
|
||||||
for( unsigned x = 0; x < alts.size(); ++x )
|
// If there are 100 or more alts, the dictionary was most likely produced or parsed incorrectly, so skip the alt queries.
|
||||||
{
|
if ( alts.size() < 100 ) {
|
||||||
/// Make an additional query for each alt
|
for ( unsigned x = 0; x < alts.size(); ++x ) {
|
||||||
|
/// Make an additional query for each alt
|
||||||
|
|
||||||
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
vector< WordArticleLink > altChain = dict.findArticles( alts[ x ], ignoreDiacritics );
|
||||||
|
if ( altChain.size() > 100 ) {
|
||||||
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
continue;
|
||||||
|
}
|
||||||
|
chain.insert( chain.end(), altChain.begin(), altChain.end() );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
multimap< wstring, pair< string, string > > mainArticles, alternateArticles;
|
||||||
|
|
||||||
set< uint32_t > articlesIncluded; // Some synonims make it that the articles
|
set< uint32_t > articlesIncluded; // Some synonyms make it that the articles
|
||||||
// appear several times. We combat this
|
// appear several times. We combat this
|
||||||
// by only allowing them to appear once.
|
// by only allowing them to appear once.
|
||||||
|
|
||||||
|
@ -1377,8 +1382,8 @@ void StardictArticleRequest::run()
|
||||||
if( ignoreDiacritics )
|
if( ignoreDiacritics )
|
||||||
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
wordCaseFolded = Folding::applyDiacriticsOnly( wordCaseFolded );
|
||||||
|
|
||||||
for( unsigned x = 0; x < chain.size(); ++x )
|
// If the chain is too large, the dictionary most likely has a creation or parsing issue, so only the first 10 entries are processed.
|
||||||
{
|
for ( unsigned x = 0; x < qMin( 10, (int)chain.size() ); ++x ) {
|
||||||
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||||
{
|
{
|
||||||
finish();
|
finish();
|
||||||
|
@ -1883,6 +1888,11 @@ static void handleIdxSynFile( string const & fileName,
|
||||||
if ( strchr( word, '/' ) )
|
if ( strchr( word, '/' ) )
|
||||||
continue; // Skip this entry
|
continue; // Skip this entry
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the entry is a lone hyphen, skip it
|
||||||
|
if ( wordLen == 1 && *word == '-' ) {
|
||||||
|
continue; // Skip this entry
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert new entry into an index
|
// Insert new entry into an index
|
||||||
|
|
|
@ -311,7 +311,7 @@ void FTSResultsRequest::run()
|
||||||
emit matchCount(matches.get_matches_estimated());
|
emit matchCount(matches.get_matches_estimated());
|
||||||
// Display the results.
|
// Display the results.
|
||||||
qDebug() << matches.get_matches_estimated() << " results found.\n";
|
qDebug() << matches.get_matches_estimated() << " results found.\n";
|
||||||
qDebug() << "Matches 1-" << matches.size() << ":\n\n";
|
qDebug() << "Matches " << matches.size() << ":\n\n";
|
||||||
QList< uint32_t > offsetsForHeadwords;
|
QList< uint32_t > offsetsForHeadwords;
|
||||||
for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
|
for( Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i )
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue