Merge pull request #89 from xiaoyifang/feature/cjk

feat:add cjk fulltext search
This commit is contained in:
xiaoyifang 2022-06-03 15:59:45 +08:00 committed by GitHub
commit 58fdbae34f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 105 additions and 72 deletions

View file

@ -1399,6 +1399,7 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
for( unsigned i = 0; i < result.size(); i++ ) for( unsigned i = 0; i < result.size(); i++ )
{ {
uint32_t articleOffset = result.at(i).articleOffset; uint32_t articleOffset = result.at(i).articleOffset;
QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets, QList<uint32_t>::Iterator it = std::lower_bound( begOffsets, endOffsets,
articleOffset ); articleOffset );
@ -1407,7 +1408,9 @@ void BtreeIndex::getHeadwordsFromOffsets( QList<uint32_t> & offsets,
if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) ) if( isCancelled && Utils::AtomicInt::loadAcquire( *isCancelled ) )
return; return;
headwords.append( QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() ) ); auto word = QString::fromUtf8( ( result[ i ].prefix + result[ i ].word ).c_str() );
headwords.append( word );
offsets.erase( it); offsets.erase( it);
begOffsets = offsets.begin(); begOffsets = offsets.begin();
endOffsets = offsets.end(); endOffsets = offsets.end();

View file

@ -39,16 +39,26 @@ bool ftsIndexIsOldOrBad( string const & indexFile,
static QString makeHiliteRegExpString( QStringList const & words, static QString makeHiliteRegExpString( QStringList const & words,
int searchMode, int searchMode,
int distanceBetweenWords ) int distanceBetweenWords, bool hasCJK = false )
{ {
QString searchString( "(" ); QString searchString( "(" );
QString stripWords( "(?:\\W+\\w+){0," ); QString stripWords( "(?:\\W+\\w+){0," );
if( distanceBetweenWords >= 0 ) if( distanceBetweenWords >= 0 )
stripWords += QString::number( distanceBetweenWords ); stripWords += QString::number( distanceBetweenWords );
stripWords += "}\\W+"; stripWords += "}";
if(!hasCJK)
{
stripWords += "\\W+";
}
QString boundWord( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)"); QString boundWord( searchMode == FTS::WholeWords ? "\\b" : "(?:\\w*)");
if(hasCJK)
{
//no boundary for CJK
boundWord.clear();
}
for( int x = 0; x < words.size(); x++ ) for( int x = 0; x < words.size(); x++ )
{ {
@ -62,6 +72,54 @@ static QString makeHiliteRegExpString( QStringList const & words,
return searchString; return searchString;
} }
/// Builds the list of index tokens for a search query that contains CJK text.
///
/// Every CJK character found in a word becomes a token of its own (a
/// surrogate pair is kept together as one token). A word that contains at
/// least one CJK character contributes only those characters; its remaining
/// non-CJK characters are dropped. Words without any CJK character are kept
/// whole, but only if they match @p wordRegExp.
///
/// @param indexWords  Output; overwritten with the de-duplicated non-CJK
///                    words followed by the de-duplicated CJK tokens.
/// @param wordRegExp  Filter applied to the words that contain no CJK
///                    character.
/// @param list        Query already split into words by the caller.
void tokenizeCJK( QStringList & indexWords, QRegularExpression wordRegExp, QStringList list )
{
  QStringList wordList, hieroglyphList;

  for( int i = 0; i < list.size(); i++ )
  {
    QString const & word = list.at( i );

    // Set once the word yields at least one CJK token
    bool parsed = false;

    for( int x = 0; x < word.size(); x++ )
    {
      if( !isCJKChar( word.at( x ).unicode() ) )
        continue;

      parsed = true;
      QString hieroglyph( word.at( x ) );

      // Keep a surrogate pair together. The bounds check matters when a
      // high surrogate is the last code unit of the word: reading
      // position x + 1 would then be out of range.
      if( word.at( x ).isHighSurrogate()
          && x + 1 < word.size()
          && word.at( x + 1 ).isLowSurrogate() )
        hieroglyph.append( word.at( ++x ) );

      hieroglyphList.append( hieroglyph );
    }

    // If the word doesn't contain CJK symbols, put it in the list as is
    if( !parsed )
      wordList.append( word );
  }

  indexWords = wordList.filter( wordRegExp );
  indexWords.removeDuplicates();

  hieroglyphList.removeDuplicates();
  indexWords += hieroglyphList;
}
/// Reports whether @p str holds at least one UTF-16 code unit that
/// isCJKChar() accepts. Scanning stops at the first hit; an empty string
/// yields false.
bool containCJK( QString const & str)
{
  for( QChar const symbol : str )
  {
    if( isCJKChar( symbol.unicode() ) )
      return true;
  }

  return false;
}
bool parseSearchString( QString const & str, QStringList & indexWords, bool parseSearchString( QString const & str, QStringList & indexWords,
QStringList & searchWords, QStringList & searchWords,
QRegExp & searchRegExp, int searchMode, QRegExp & searchRegExp, int searchMode,
@ -76,38 +134,35 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption ); QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption); QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
hasCJK = false; hasCJK = containCJK( str );
for( int x = 0; x < str.size(); x++ )
if( isCJKChar( str.at( x ).unicode() ) )
{
hasCJK = true;
break;
}
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText ) if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
{ {
if( hasCJK )
return false;
// Make words list for search in article text // Make words list for search in article text
searchWords = str.normalized( QString::NormalizationForm_C ) searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts );
.split( spacesRegExp, Qt::SkipEmptyParts );
// Make words list for index search // Make words list for index search
QStringList list = str.normalized( QString::NormalizationForm_C ) QStringList list =
.toLower().split( spacesRegExp, Qt::SkipEmptyParts ); str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts );
indexWords = list.filter( wordRegExp );
indexWords.removeDuplicates();
// Make regexp for results hilite QString searchString;
if( hasCJK )
{
tokenizeCJK( indexWords, wordRegExp, list );
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( indexWords, searchMode, distanceBetweenWords, hasCJK );
}
else
{
indexWords = list.filter( wordRegExp );
indexWords.removeDuplicates();
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); // Make regexp for results hilite
QString searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
QRegExp::RegExp2 ); searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
}
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
searchRegExp.setMinimal( true ); searchRegExp.setMinimal( true );
return !indexWords.isEmpty(); return !indexWords.isEmpty();
} }
else else
@ -128,38 +183,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
if( hasCJK ) if( hasCJK )
{ {
QStringList wordList, hieroglyphList; tokenizeCJK( indexWords, wordRegExp, list );
for( int i = 0; i < list.size(); i ++ )
{
QString word = list.at( i );
// Check for CJK symbols in word
bool parsed = false;
QString hieroglyph;
for( int x = 0; x < word.size(); x++ )
if( isCJKChar( word.at( x ).unicode() ) )
{
parsed = true;
hieroglyph.append( word[ x ] );
if( QChar( word.at( x ) ).isHighSurrogate()
&& QChar( word[ x + 1 ] ).isLowSurrogate() )
hieroglyph.append( word[ ++x ] );
hieroglyphList.append( hieroglyph );
hieroglyph.clear();
}
// If word don't contains CJK symbols put it in list as is
if( !parsed )
wordList.append( word );
}
indexWords = wordList.filter( wordRegExp );
indexWords.removeDuplicates();
hieroglyphList.removeDuplicates();
indexWords += hieroglyphList;
} }
else else
{ {
@ -543,6 +567,7 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
int n; int n;
for( n = 0; n < parsedWords.size(); n++ ) for( n = 0; n < parsedWords.size(); n++ )
{ {
auto parsed_word = parsedWords.at( n );
if( ignoreWordsOrder ) if( ignoreWordsOrder )
{ {
int i; int i;
@ -550,8 +575,8 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
{ {
if( wordsList.at( i ).second ) if( wordsList.at( i ).second )
{ {
if( ( searchMode == FTS::WholeWords && parsedWords.at( n ).compare( wordsList.at( i ).first, cs ) == 0 ) if( ( searchMode == FTS::WholeWords && parsed_word.compare( wordsList.at( i ).first, cs ) == 0 )
|| ( searchMode == FTS::PlainText && parsedWords.at( n ).contains( wordsList.at( i ).first, cs ) ) ) || ( searchMode == FTS::PlainText && parsed_word.contains( wordsList.at( i ).first, cs ) ) )
{ {
wordsList[ i ].second = false; wordsList[ i ].second = false;
@ -630,8 +655,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
} }
else else
{ {
if( ( searchMode == FTS::WholeWords && parsedWords.at( n ).compare( words.at( matchWordNom ), cs ) == 0 ) //for cjk word, FTS::WholeWords and FTS::PlainText actually have same effect.
|| ( searchMode == FTS::PlainText && parsedWords.at( n ).contains( words.at( matchWordNom ), cs ) ) ) auto match_word = words.at( matchWordNom );
bool hasCJK = containCJK( match_word );
if( ( searchMode == FTS::WholeWords &&
( ( !hasCJK&& parsed_word.compare( match_word, cs ) == 0 ) ||
( hasCJK && parsed_word.contains( match_word, cs ) ) ) )
|| ( searchMode == FTS::PlainText && parsed_word.contains( match_word, cs ) ) )
{ {
matchWordNom += 1; matchWordNom += 1;

View file

@ -353,16 +353,16 @@ void FullTextSearchDialog::accept()
distanceBetweenWords, distanceBetweenWords,
hasCJK ) ) hasCJK ) )
{ {
if( hasCJK && ( mode == WholeWords || mode == PlainText ) ) // if( hasCJK && ( mode == WholeWords || mode == PlainText ) )
{ // {
QMessageBox message( QMessageBox::Warning, // QMessageBox message( QMessageBox::Warning,
"GoldenDict", // "GoldenDict",
tr( "CJK symbols in search string are not compatible with search modes \"Whole words\" and \"Plain text\"" ), // tr( "CJK symbols in search string are not compatible with search modes \"Whole words\" and \"Plain text\"" ),
QMessageBox::Ok, // QMessageBox::Ok,
this ); // this );
message.exec(); // message.exec();
} // }
else // else
{ {
QMessageBox message( QMessageBox::Warning, QMessageBox message( QMessageBox::Warning,
"GoldenDict", "GoldenDict",