Merge pull request #90 from xiaoyifang/feature/cjk

improve: fulltext search checkArticle optimize
This commit is contained in:
xiaoyifang 2022-06-04 16:11:50 +08:00 committed by GitHub
commit c42513ebe8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 120 additions and 250 deletions

View file

@ -15,6 +15,7 @@
#include <QVector>
#include <QRegularExpression>
#include "wildcard.hh"
using std::vector;
@ -38,8 +39,7 @@ bool ftsIndexIsOldOrBad( string const & indexFile,
}
static QString makeHiliteRegExpString( QStringList const & words,
int searchMode,
int distanceBetweenWords, bool hasCJK = false )
int searchMode, int distanceBetweenWords, bool hasCJK = false, bool ignoreWordsOrder = false )
{
QString searchString( "(" );
@ -63,9 +63,20 @@ static QString makeHiliteRegExpString( QStringList const & words,
for( int x = 0; x < words.size(); x++ )
{
if( x )
{
searchString += stripWords;
if(ignoreWordsOrder)
searchString += "(";
}
searchString += boundWord + words[ x ] + boundWord;
if( x )
{
if( ignoreWordsOrder )
searchString += ")?";
}
}
searchString += ")";
@ -125,7 +136,8 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
QRegExp & searchRegExp, int searchMode,
bool matchCase,
int distanceBetweenWords,
bool & hasCJK )
bool & hasCJK,
bool ignoreWordsOrder )
{
searchWords.clear();
indexWords.clear();
@ -149,7 +161,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
{
tokenizeCJK( indexWords, wordRegExp, list );
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( indexWords, searchMode, distanceBetweenWords, hasCJK );
searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
}
else
{
@ -159,7 +171,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
// Make regexp for results hilite
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
}
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
searchRegExp.setMinimal( true );
@ -443,25 +455,23 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
QRegularExpression::UseUnicodePropertiesOption);
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
// RegExp mode
QRegularExpression searchRegularExpression;
if( searchMode == FTS::Wildcards )
searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
else
searchRegularExpression.setPattern( searchRegexp.pattern() );
QRegularExpression::PatternOptions patternOptions =
QRegularExpression::DotMatchesEverythingOption | QRegularExpression::UseUnicodePropertiesOption
| QRegularExpression::MultilineOption | QRegularExpression::InvertedGreedinessOption;
if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
patternOptions |= QRegularExpression::CaseInsensitiveOption;
searchRegularExpression.setPatternOptions( patternOptions );
if( !searchRegularExpression.isValid() )
searchRegularExpression.setPattern( "" );
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
{
// RegExp mode
QRegularExpression searchRegularExpression;
if( searchMode == FTS::Wildcards )
searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
else
searchRegularExpression.setPattern( searchRegexp.pattern() );
QRegularExpression::PatternOptions patternOptions = QRegularExpression::DotMatchesEverythingOption
| QRegularExpression::UseUnicodePropertiesOption
| QRegularExpression::MultilineOption
| QRegularExpression::InvertedGreedinessOption;
if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
patternOptions |= QRegularExpression::CaseInsensitiveOption;
searchRegularExpression.setPatternOptions( patternOptions );
if( !searchRegularExpression.isValid() )
searchRegularExpression.setPattern( "" );
for( int i = 0; i < offsets.size(); i++ )
{
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
@ -527,243 +537,92 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
if( ignoreDiacritics )
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
Qt::SkipEmptyParts );
//QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
// Qt::SkipEmptyParts );
int wordsNum = articleWords.length();
while ( pos < wordsNum )
if(ignoreWordsOrder)
{
QString s = articleWords[ pos ];
bool breakSearch = false;
QStringList parsedWords;
if( needHandleBrackets && ( s.indexOf( '(' ) >= 0 || s.indexOf( ')' ) >= 0 ) )
bool allMatch = true;
foreach( QString word, words )
{
if( containCJK( word ) || searchMode == FTS::PlainText )
{
// Handle brackets
QRegularExpressionMatch match_brackets = regBrackets.match( s );
if( match_brackets.hasMatch() )
if( !articleText.contains( word ) )
{
QStringList parts = match_brackets.capturedTexts();
// Add empty strings for compatibility with QRegExp behaviour
for( int i = match_brackets.lastCapturedIndex() + 1; i < 6; i++ )
parts.append( QString() );
QString word = parts[ 2 ] + parts[ 4 ]; // Brackets removed
parsedWords.append( word );
word = parts[ 1 ].remove( '(' ).remove( ')' )
+ parts[ 2 ]
+ parts[ 3 ].remove( '(' ).remove( ')' )
+ parts[ 4 ]
+ parts[ 5 ].remove( '(' ).remove( ')' ); // Brackets expansed
parsedWords.append( word );
allMatch = false;
break;
}
}
else if( searchMode == FTS::WholeWords)
{
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption );
if( !articleText.contains( tmpReg) )
{
allMatch = false;
break;
}
}
}
if(!allMatch)
{
continue;
}
if( distanceBetweenWords >= 0 )
{
// the article text contains all the needed words.
// determine if distance restriction is meet
QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article.
articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
auto hasCJK = false;
foreach(QString word,words)
{
if(containCJK( word ))
{
hasCJK = true;
break;
}
}
//hascjk value ,perhaps should depend on each word
auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK );
QRegularExpression distanceOrderReg( searchRegStr,
QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article.
if(articleText.contains(distanceOrderReg))
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) );
else
parsedWords = s.split( regSplit, Qt::SkipEmptyParts );
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++;
if( maxResults > 0 && results >= maxResults )
break;
}
else
parsedWords.append( s );
int n;
for( n = 0; n < parsedWords.size(); n++ )
{
auto parsed_word = parsedWords.at( n );
if( ignoreWordsOrder )
{
int i;
for( i = 0; i < wordsList.size(); i++ )
{
if( wordsList.at( i ).second )
{
if( ( searchMode == FTS::WholeWords && parsed_word.compare( wordsList.at( i ).first, cs ) == 0 )
|| ( searchMode == FTS::PlainText && parsed_word.contains( wordsList.at( i ).first, cs ) ) )
{
wordsList[ i ].second = false;
if( parsedWords.size() > 1 )
{
QString wordToHilite = s;
while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) )
wordToHilite.remove( 0, 1 );
while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) )
wordToHilite.chop( 1 );
order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) );
}
else
order.append( wordsList.at( i ).first );
break;
}
}
}
if( i < wordsList.size() )
{
// Word found
matchWordNom += 1;
if( matchWordNom == 1 )
{
// Store position to remake search if sequence will not be found
nextNotFoundPos = pos + 1;
}
if( matchWordNom >= words.size() )
{
// All words are found
// Store found words sequence and continue search
// It's nesessary for hilite search results
// Check if such sequence already presented
int x;
for( x = 0; x < allOrders.size(); x++ )
{
if( allOrders[ x ] == order )
break;
}
if( x >= allOrders.size() )
allOrders.append( order );
order.clear();
matchWordNom = 0;
unmatchWordNom = 0;
for( int i = 0; i < wordsList.size(); i++ )
wordsList[ i ].second = true;
nextNotFoundPos = 0;
break;
}
unmatchWordNom = 0;
break;
}
else
if( matchWordNom > 0 && n >= parsedWords.size() - 1 )
{
unmatchWordNom += 1;
if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords )
{
// Sequence broken, clear all counters
matchWordNom = 0;
unmatchWordNom = 0;
for( int i = 0; i < wordsList.size(); i++ )
wordsList[ i ].second = true;
order.clear();
}
}
}
else
{
//for cjk word, FTS::WholeWords and FTS::PlainText actually have same effect.
auto match_word = words.at( matchWordNom );
bool hasCJK = containCJK( match_word );
if( ( searchMode == FTS::WholeWords &&
( ( !hasCJK&& parsed_word.compare( match_word, cs ) == 0 ) ||
( hasCJK && parsed_word.contains( match_word, cs ) ) ) )
|| ( searchMode == FTS::PlainText && parsed_word.contains( match_word, cs ) ) )
{
matchWordNom += 1;
if( matchWordNom == 1 )
{
// Store position to remake search if sequence will not be found
nextNotFoundPos = pos + 1;
}
if( needHandleBrackets )
{
if( parsedWords.size() > 1 )
{
QString wordToHilite = s;
while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) )
wordToHilite.remove( 0, 1 );
while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) )
wordToHilite.chop( 1 );
order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) );
}
else
order.append( words.at( matchWordNom - 1 ) );
}
if( matchWordNom >= words.size() )
{
// All words are found
if( needHandleBrackets )
{
if( allOrders.isEmpty() )
allOrders.append( words );
// Check if such sequence already presented
int x;
for( x = 0; x < allOrders.size(); x++ )
{
if( allOrders[ x ] == order )
break;
}
if( x >= allOrders.size() )
allOrders.append( order );
matchWordNom = 0;
unmatchWordNom = 0;
order.clear();
nextNotFoundPos = 0;
}
else
breakSearch = true;
break;
}
unmatchWordNom = 0;
break;
}
else
if( matchWordNom > 0 && n >= parsedWords.size() - 1 )
{
unmatchWordNom += 1;
if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords )
{
matchWordNom = 0;
unmatchWordNom = 0;
if( needHandleBrackets )
order.clear();
}
}
}
}
if( breakSearch )
break;
if( nextNotFoundPos > 0 && matchWordNom == 0 )
{
pos = nextNotFoundPos;
nextNotFoundPos = 0;
}
else
pos += 1;
}
}
if( !allOrders.isEmpty() || matchWordNom >= words.size() )
else
{
QStringList hiliteReg;
if( !allOrders.isEmpty() )
if( articleText.contains( searchRegularExpression ) )
{
for( int i = 0; i < allOrders.size(); i++ )
{
QString hiliteStr = makeHiliteRegExpString( allOrders.at( i ), searchMode, distanceBetweenWords );
hiliteReg.append( hiliteStr );
}
allOrders.clear();
if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) );
else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++;
if( maxResults > 0 && results >= maxResults )
break;
}
if( headword.isEmpty() )
{
offsetsForHeadwords.append( offsets.at( i ) );
hiliteRegExps.append( hiliteReg );
}
else
foundHeadwords->append( FTS::FtsHeadword( headword, id, hiliteReg, matchCase ) );
results++;
if( maxResults > 0 && results >= maxResults )
break;
}
}
}
@ -1150,7 +1009,7 @@ void FTSResultsRequest::run()
QRegExp searchRegExp;
if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
searchMode, matchCase, distanceBetweenWords, hasCJK ) )
searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
{
finish();
return;

View file

@ -55,7 +55,8 @@ bool parseSearchString( QString const & str, QStringList & IndexWords,
QRegExp & searchRegExp, int searchMode,
bool matchCase,
int distanceBetweenWords,
bool & hasCJK );
bool & hasCJK,
bool ignoreWordsOrder = false );
void parseArticleForFts( uint32_t articleAddress, QString & articleText,
QMap< QString, QVector< uint32_t > > & words,

View file

@ -351,7 +351,7 @@ void FullTextSearchDialog::accept()
searchRegExp, mode,
ui.matchCase->isChecked(),
distanceBetweenWords,
hasCJK ) )
hasCJK, ignoreWordsOrder ) )
{
// if( hasCJK && ( mode == WholeWords || mode == PlainText ) )
// {

View file

@ -118,6 +118,16 @@ inline QString json2String( const QJsonObject & json )
return QString( QJsonDocument( json ).toJson( QJsonDocument::Compact ) );
}
inline QStringList repeat( const QString str, const int times )
{
QStringList list;
for( int i = 0; i < times; i++ )
{
list << str;
}
return list;
}
namespace AtomicInt
{