mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-12-12 10:54:07 +00:00
Merge pull request #89 from xiaoyifang/feature/cjk
feat:add cjk fulltext search
This commit is contained in:
commit
58fdbae34f
|
@ -1399,6 +1399,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
|
||||||
for( unsigned i = 0; i < result.size(); i++ )
|
for( unsigned i = 0; i < result.size(); i++ )
|
||||||
{
|
{
|
||||||
uint32_t articleOffset = result.at(i).articleOffset;
|
uint32_t articleOffset = result.at(i).articleOffset;
|
||||||
|
|
||||||
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
|
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
|
||||||
articleOffset );
|
articleOffset );
|
||||||
|
|
||||||
|
@ -1407,7 +1408,9 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
|
||||||
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
|
||||||
return;
|
return;
|
||||||
|
|
||||||
headwords.append( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) );
|
auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() );
|
||||||
|
|
||||||
|
headwords.append( word );
|
||||||
offsets.erase( it);
|
offsets.erase( it);
|
||||||
begOffsets = offsets.begin();
|
begOffsets = offsets.begin();
|
||||||
endOffsets = offsets.end();
|
endOffsets = offsets.end();
|
||||||
|
|
172
ftshelpers.cc
172
ftshelpers.cc
|
@ -39,16 +39,26 @@ bool ftsIndexIsOldOrBad( string const & indexFile,
|
||||||
|
|
||||||
static QString makeHiliteRegExpString( QStringList const & words,
|
static QString makeHiliteRegExpString( QStringList const & words,
|
||||||
int searchMode,
|
int searchMode,
|
||||||
int distanceBetweenWords )
|
int distanceBetweenWords, bool hasCJK = false )
|
||||||
{
|
{
|
||||||
QString searchString( "(" );
|
QString searchString( "(" );
|
||||||
|
|
||||||
QString stripWords( "(?:\\W+\\w+){0," );
|
QString stripWords( "(?:\\W+\\w+){0," );
|
||||||
if( distanceBetweenWords >= 0 )
|
if( distanceBetweenWords >= 0 )
|
||||||
stripWords += QString::number( distanceBetweenWords );
|
stripWords += QString::number( distanceBetweenWords );
|
||||||
stripWords += "}\\W+";
|
stripWords += "}";
|
||||||
|
|
||||||
|
if(!hasCJK)
|
||||||
|
{
|
||||||
|
stripWords += "\\W+";
|
||||||
|
}
|
||||||
|
|
||||||
QString boundWord( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)");
|
QString boundWord( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)");
|
||||||
|
if(hasCJK)
|
||||||
|
{
|
||||||
|
//no boundary for CJK
|
||||||
|
boundWord.clear();
|
||||||
|
}
|
||||||
|
|
||||||
for( int x = 0; x < words.size(); x++ )
|
for( int x = 0; x < words.size(); x++ )
|
||||||
{
|
{
|
||||||
|
@ -62,71 +72,7 @@ static QString makeHiliteRegExpString( QStringList const & words,
|
||||||
return searchString;
|
return searchString;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool parseSearchString( QString const & str, QStringList & indexWords,
|
void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
|
||||||
QStringList & searchWords,
|
|
||||||
QRegExp & searchRegExp, int searchMode,
|
|
||||||
bool matchCase,
|
|
||||||
int distanceBetweenWords,
|
|
||||||
bool & hasCJK )
|
|
||||||
{
|
|
||||||
searchWords.clear();
|
|
||||||
indexWords.clear();
|
|
||||||
QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
|
|
||||||
QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
|
|
||||||
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
|
|
||||||
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
|
|
||||||
|
|
||||||
hasCJK = false;
|
|
||||||
for( int x = 0; x < str.size(); x++ )
|
|
||||||
if( isCJKChar( str.at( x ).unicode() ) )
|
|
||||||
{
|
|
||||||
hasCJK = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
|
|
||||||
{
|
|
||||||
if( hasCJK )
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// Make words list for search in article text
|
|
||||||
searchWords = str.normalized( QString::NormalizationForm_C )
|
|
||||||
.split( spacesRegExp, Qt::SkipEmptyParts );
|
|
||||||
|
|
||||||
// Make words list for index search
|
|
||||||
QStringList list = str.normalized( QString::NormalizationForm_C )
|
|
||||||
.toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
|
||||||
indexWords = list.filter( wordRegExp );
|
|
||||||
indexWords.removeDuplicates();
|
|
||||||
|
|
||||||
// Make regexp for results hilite
|
|
||||||
|
|
||||||
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
|
||||||
QString searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
|
|
||||||
|
|
||||||
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive,
|
|
||||||
QRegExp::RegExp2 );
|
|
||||||
searchRegExp.setMinimal( true );
|
|
||||||
|
|
||||||
return !indexWords.isEmpty();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Make words list for index search
|
|
||||||
|
|
||||||
QString tmp = str;
|
|
||||||
|
|
||||||
// Remove RegExp commands
|
|
||||||
if( searchMode == FTS::RegExp )
|
|
||||||
tmp.replace( regexRegExp, " " );
|
|
||||||
|
|
||||||
// Remove all symbol sets
|
|
||||||
tmp.replace( setsRegExp, " " );
|
|
||||||
|
|
||||||
QStringList list = tmp.normalized( QString::NormalizationForm_C )
|
|
||||||
.toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
|
||||||
|
|
||||||
if( hasCJK )
|
|
||||||
{
|
{
|
||||||
QStringList wordList, hieroglyphList;
|
QStringList wordList, hieroglyphList;
|
||||||
for( int i = 0; i < list.size(); i ++ )
|
for( int i = 0; i < list.size(); i ++ )
|
||||||
|
@ -161,6 +107,84 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
||||||
hieroglyphList.removeDuplicates();
|
hieroglyphList.removeDuplicates();
|
||||||
indexWords += hieroglyphList;
|
indexWords += hieroglyphList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns true if isCJKChar() accepts at least one character of @p str,
/// and false otherwise (including when @p str is empty).
bool containCJK( QString const & str )
{
  for( QChar const ch : str )
  {
    // A single hit is enough, so stop scanning at the first match.
    if( isCJKChar( ch.unicode() ) )
      return true;
  }

  return false;
}
|
||||||
|
|
||||||
|
bool parseSearchString( QString const & str, QStringList & indexWords,
|
||||||
|
QStringList & searchWords,
|
||||||
|
QRegExp & searchRegExp, int searchMode,
|
||||||
|
bool matchCase,
|
||||||
|
int distanceBetweenWords,
|
||||||
|
bool & hasCJK )
|
||||||
|
{
|
||||||
|
searchWords.clear();
|
||||||
|
indexWords.clear();
|
||||||
|
QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
|
||||||
|
QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
|
||||||
|
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
|
||||||
|
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
|
||||||
|
|
||||||
|
hasCJK = containCJK( str );
|
||||||
|
|
||||||
|
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
|
||||||
|
{
|
||||||
|
// Make words list for search in article text
|
||||||
|
searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts );
|
||||||
|
// Make words list for index search
|
||||||
|
QStringList list =
|
||||||
|
str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
||||||
|
|
||||||
|
QString searchString;
|
||||||
|
if( hasCJK )
|
||||||
|
{
|
||||||
|
tokenizeCJK( indexWords, wordRegExp, list );
|
||||||
|
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||||
|
searchString = makeHiliteRegExpString( indexWords, searchMode, distanceBetweenWords, hasCJK );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
indexWords = list.filter( wordRegExp );
|
||||||
|
indexWords.removeDuplicates();
|
||||||
|
|
||||||
|
// Make regexp for results hilite
|
||||||
|
|
||||||
|
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||||
|
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
|
||||||
|
}
|
||||||
|
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
|
||||||
|
searchRegExp.setMinimal( true );
|
||||||
|
return !indexWords.isEmpty();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Make words list for index search
|
||||||
|
|
||||||
|
QString tmp = str;
|
||||||
|
|
||||||
|
// Remove RegExp commands
|
||||||
|
if( searchMode == FTS::RegExp )
|
||||||
|
tmp.replace( regexRegExp, " " );
|
||||||
|
|
||||||
|
// Remove all symbol sets
|
||||||
|
tmp.replace( setsRegExp, " " );
|
||||||
|
|
||||||
|
QStringList list = tmp.normalized( QString::NormalizationForm_C )
|
||||||
|
.toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
||||||
|
|
||||||
|
if( hasCJK )
|
||||||
|
{
|
||||||
|
tokenizeCJK( indexWords, wordRegExp, list );
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
indexWords = list.filter( wordRegExp );
|
indexWords = list.filter( wordRegExp );
|
||||||
|
@ -543,6 +567,7 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
||||||
int n;
|
int n;
|
||||||
for( n = 0; n < parsedWords.size(); n++ )
|
for( n = 0; n < parsedWords.size(); n++ )
|
||||||
{
|
{
|
||||||
|
auto parsed_word = parsedWords.at( n );
|
||||||
if( ignoreWordsOrder )
|
if( ignoreWordsOrder )
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
@ -550,8 +575,8 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
||||||
{
|
{
|
||||||
if( wordsList.at( i ).second )
|
if( wordsList.at( i ).second )
|
||||||
{
|
{
|
||||||
if( ( searchMode == FTS::WholeWords && parsedWords.at( n ).compare( wordsList.at( i ).first, cs ) == 0 )
|
if( ( searchMode == FTS::WholeWords && parsed_word.compare( wordsList.at( i ).first, cs ) == 0 )
|
||||||
|| ( searchMode == FTS::PlainText && parsedWords.at( n ).contains( wordsList.at( i ).first, cs ) ) )
|
|| ( searchMode == FTS::PlainText && parsed_word.contains( wordsList.at( i ).first, cs ) ) )
|
||||||
{
|
{
|
||||||
wordsList[ i ].second = false;
|
wordsList[ i ].second = false;
|
||||||
|
|
||||||
|
@ -630,8 +655,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if( ( searchMode == FTS::WholeWords && parsedWords.at( n ).compare( words.at( matchWordNom ), cs ) == 0 )
|
// For CJK words, FTS::WholeWords and FTS::PlainText have the same effect.
|
||||||
|| ( searchMode == FTS::PlainText && parsedWords.at( n ).contains( words.at( matchWordNom ), cs ) ) )
|
auto match_word = words.at( matchWordNom );
|
||||||
|
bool hasCJK = containCJK( match_word );
|
||||||
|
if( ( searchMode == FTS::WholeWords &&
|
||||||
|
( ( !hasCJK&& parsed_word.compare( match_word, cs ) == 0 ) ||
|
||||||
|
( hasCJK && parsed_word.contains( match_word, cs ) ) ) )
|
||||||
|
|| ( searchMode == FTS::PlainText && parsed_word.contains( match_word, cs ) ) )
|
||||||
{
|
{
|
||||||
matchWordNom += 1;
|
matchWordNom += 1;
|
||||||
|
|
||||||
|
|
|
@ -353,16 +353,16 @@ void FullTextSearchDialog::accept()
|
||||||
distanceBetweenWords,
|
distanceBetweenWords,
|
||||||
hasCJK ) )
|
hasCJK ) )
|
||||||
{
|
{
|
||||||
if( hasCJK && ( mode == WholeWords || mode == PlainText ) )
|
// if( hasCJK && ( mode == WholeWords || mode == PlainText ) )
|
||||||
{
|
// {
|
||||||
QMessageBox message( QMessageBox::Warning,
|
// QMessageBox message( QMessageBox::Warning,
|
||||||
"GoldenDict",
|
// "GoldenDict",
|
||||||
tr( "CJK symbols in search string are not compatible with search modes \"Whole words\" and \"Plain text\"" ),
|
// tr( "CJK symbols in search string are not compatible with search modes \"Whole words\" and \"Plain text\"" ),
|
||||||
QMessageBox::Ok,
|
// QMessageBox::Ok,
|
||||||
this );
|
// this );
|
||||||
message.exec();
|
// message.exec();
|
||||||
}
|
// }
|
||||||
else
|
// else
|
||||||
{
|
{
|
||||||
QMessageBox message( QMessageBox::Warning,
|
QMessageBox message( QMessageBox::Warning,
|
||||||
"GoldenDict",
|
"GoldenDict",
|
||||||
|
|
Loading…
Reference in a new issue