From 3e05e711e5ebab7ca5c4f34b30b90df225caaecf Mon Sep 17 00:00:00 2001 From: Xiao YiFang Date: Fri, 3 Jun 2022 20:07:14 +0800 Subject: [PATCH] improve: fulltext search checkArticle optimize --- ftshelpers.cc | 355 ++++++++++++++-------------------------------- ftshelpers.hh | 3 +- fulltextsearch.cc | 2 +- utils.hh | 10 ++ 4 files changed, 120 insertions(+), 250 deletions(-) diff --git a/ftshelpers.cc b/ftshelpers.cc index 30517ae5..e1b51641 100644 --- a/ftshelpers.cc +++ b/ftshelpers.cc @@ -15,6 +15,7 @@ #include #include + #include "wildcard.hh" using std::vector; @@ -38,8 +39,7 @@ bool ftsIndexIsOldOrBad( string const & indexFile, } static QString makeHiliteRegExpString( QStringList const & words, - int searchMode, - int distanceBetweenWords, bool hasCJK = false ) + int searchMode, int distanceBetweenWords, bool hasCJK = false, bool ignoreWordsOrder = false ) { QString searchString( "(" ); @@ -63,9 +63,20 @@ static QString makeHiliteRegExpString( QStringList const & words, for( int x = 0; x < words.size(); x++ ) { if( x ) + { searchString += stripWords; + if(ignoreWordsOrder) + searchString += "("; + } searchString += boundWord + words[ x ] + boundWord; + + if( x ) + { + if( ignoreWordsOrder ) + searchString += ")?"; + } + } searchString += ")"; @@ -125,7 +136,8 @@ bool parseSearchString( QString const & str, QStringList & indexWords, QRegExp & searchRegExp, int searchMode, bool matchCase, int distanceBetweenWords, - bool & hasCJK ) + bool & hasCJK, + bool ignoreWordsOrder ) { searchWords.clear(); indexWords.clear(); @@ -149,7 +161,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords, { tokenizeCJK( indexWords, wordRegExp, list ); // QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); - searchString = makeHiliteRegExpString( indexWords, searchMode, distanceBetweenWords, hasCJK ); + searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder); } else { @@ -159,7 +171,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords, // Make regexp for results hilite QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); - searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords ); + searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder ); } searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 ); searchRegExp.setMinimal( true ); @@ -443,25 +455,23 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, QRegularExpression::UseUnicodePropertiesOption); QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); + // RegExp mode + QRegularExpression searchRegularExpression; + if( searchMode == FTS::Wildcards ) + searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) ); + else + searchRegularExpression.setPattern( searchRegexp.pattern() ); + QRegularExpression::PatternOptions patternOptions = + QRegularExpression::DotMatchesEverythingOption | QRegularExpression::UseUnicodePropertiesOption + | QRegularExpression::MultilineOption | QRegularExpression::InvertedGreedinessOption; + if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive ) + patternOptions |= QRegularExpression::CaseInsensitiveOption; + searchRegularExpression.setPatternOptions( patternOptions ); + if( !searchRegularExpression.isValid() ) + searchRegularExpression.setPattern( "" ); if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp ) { - // RegExp mode - - QRegularExpression searchRegularExpression; - if( searchMode == FTS::Wildcards ) - searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) ); - else - searchRegularExpression.setPattern( searchRegexp.pattern() ); - QRegularExpression::PatternOptions patternOptions = QRegularExpression::DotMatchesEverythingOption - | QRegularExpression::UseUnicodePropertiesOption - | QRegularExpression::MultilineOption - | QRegularExpression::InvertedGreedinessOption; - if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive ) - patternOptions |= QRegularExpression::CaseInsensitiveOption; - searchRegularExpression.setPatternOptions( patternOptions ); - if( !searchRegularExpression.isValid() ) - searchRegularExpression.setPattern( "" ); for( int i = 0; i < offsets.size(); i++ ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) @@ -527,243 +537,92 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, if( ignoreDiacritics ) articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) ); - QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets, - Qt::SkipEmptyParts ); + //QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets, + // Qt::SkipEmptyParts ); - int wordsNum = articleWords.length(); - while ( pos < wordsNum ) + if(ignoreWordsOrder) { - QString s = articleWords[ pos ]; - bool breakSearch = false; - - QStringList parsedWords; - if( needHandleBrackets && ( s.indexOf( '(' ) >= 0 || s.indexOf( ')' ) >= 0 ) ) + bool allMatch = true; + foreach( QString word, words ) + { + if( containCJK( word ) || searchMode == FTS::PlainText ) { - // Handle brackets - QRegularExpressionMatch match_brackets = regBrackets.match( s ); - if( match_brackets.hasMatch() ) + if( !articleText.contains( word ) ) { - QStringList parts = match_brackets.capturedTexts(); - // Add empty strings for compatibility with QRegExp behaviour - for( int i = match_brackets.lastCapturedIndex() + 1; i < 6; i++ ) - parts.append( QString() ); - - QString word = parts[ 2 ] + parts[ 4 ]; // Brackets removed - parsedWords.append( word ); - - word = parts[ 1 ].remove( '(' ).remove( ')' ) - + parts[ 2 ] - + parts[ 3 ].remove( '(' ).remove( ')' ) - + parts[ 4 ] - + parts[ 5 ].remove( '(' ).remove( ')' ); // Brackets expansed - parsedWords.append( word ); + allMatch = false; + break; } + } + else if( searchMode == FTS::WholeWords) + { + QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption ); + if( !articleText.contains( tmpReg) ) + { + allMatch = false; + break; + } + } + + } + + if(!allMatch) + { + continue; + } + + if( distanceBetweenWords >= 0 ) + { + // the article text contains all the needed words. + // determine if distance restriction is meet + QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ), + QRegularExpression::CaseInsensitiveOption | + QRegularExpression::UseUnicodePropertiesOption ); + // use a string that could not be presented in the article. + articleText = articleText.replace( replaceReg, "=@XXXXX@=" ); + + auto hasCJK = false; + foreach(QString word,words) + { + if(containCJK( word )) + { + hasCJK = true; + break; + } + } + + //hascjk value ,perhaps should depend on each word + auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK ); + QRegularExpression distanceOrderReg( searchRegStr, + QRegularExpression::CaseInsensitiveOption | + QRegularExpression::UseUnicodePropertiesOption ); + // use a string that could not be presented in the article. + if(articleText.contains(distanceOrderReg)) + { + if( headword.isEmpty() ) + offsetsForHeadwords.append( offsets.at( i ) ); else - parsedWords = s.split( regSplit, Qt::SkipEmptyParts ); + foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); + + results++; + if( maxResults > 0 && results >= maxResults ) + break; } - else - parsedWords.append( s ); - - int n; - for( n = 0; n < parsedWords.size(); n++ ) - { - auto parsed_word = parsedWords.at( n ); - if( ignoreWordsOrder ) - { - int i; - for( i = 0; i < wordsList.size(); i++ ) - { - if( wordsList.at( i ).second ) - { - if( ( searchMode == FTS::WholeWords && parsed_word.compare( wordsList.at( i ).first, cs ) == 0 ) - || ( searchMode == FTS::PlainText && parsed_word.contains( wordsList.at( i ).first, cs ) ) ) - { - wordsList[ i ].second = false; - - if( parsedWords.size() > 1 ) - { - QString wordToHilite = s; - while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) ) - wordToHilite.remove( 0, 1 ); - while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) ) - wordToHilite.chop( 1 ); - order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) ); - } - else - order.append( wordsList.at( i ).first ); - - break; - } - } - } - if( i < wordsList.size() ) - { - // Word found - - matchWordNom += 1; - - if( matchWordNom == 1 ) - { - // Store position to remake search if sequence will not be found - nextNotFoundPos = pos + 1; - } - - if( matchWordNom >= words.size() ) - { - // All words are found - // Store found words sequence and continue search - // It's nesessary for hilite search results - - // Check if such sequence already presented - int x; - for( x = 0; x < allOrders.size(); x++ ) - { - if( allOrders[ x ] == order ) - break; - } - if( x >= allOrders.size() ) - allOrders.append( order ); - - order.clear(); - - matchWordNom = 0; - unmatchWordNom = 0; - for( int i = 0; i < wordsList.size(); i++ ) - wordsList[ i ].second = true; - nextNotFoundPos = 0; - - break; - } - - unmatchWordNom = 0; - break; - } - else - if( matchWordNom > 0 && n >= parsedWords.size() - 1 ) - { - unmatchWordNom += 1; - if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords ) - { - // Sequence broken, clear all counters - matchWordNom = 0; - unmatchWordNom = 0; - for( int i = 0; i < wordsList.size(); i++ ) - wordsList[ i ].second = true; - order.clear(); - } - } - } - else - { - //for cjk word, FTS::WholeWords and FTS::PlainText actually have same effect. - auto match_word = words.at( matchWordNom ); - bool hasCJK = containCJK( match_word ); - if( ( searchMode == FTS::WholeWords && - ( ( !hasCJK&& parsed_word.compare( match_word, cs ) == 0 ) || - ( hasCJK && parsed_word.contains( match_word, cs ) ) ) ) - || ( searchMode == FTS::PlainText && parsed_word.contains( match_word, cs ) ) ) - { - matchWordNom += 1; - - if( matchWordNom == 1 ) - { - // Store position to remake search if sequence will not be found - nextNotFoundPos = pos + 1; - } - - if( needHandleBrackets ) - { - if( parsedWords.size() > 1 ) - { - QString wordToHilite = s; - while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) ) - wordToHilite.remove( 0, 1 ); - while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) ) - wordToHilite.chop( 1 ); - order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) ); - } - else - order.append( words.at( matchWordNom - 1 ) ); - } - - if( matchWordNom >= words.size() ) - { - // All words are found - if( needHandleBrackets ) - { - if( allOrders.isEmpty() ) - allOrders.append( words ); - - // Check if such sequence already presented - int x; - for( x = 0; x < allOrders.size(); x++ ) - { - if( allOrders[ x ] == order ) - break; - } - if( x >= allOrders.size() ) - allOrders.append( order ); - - matchWordNom = 0; - unmatchWordNom = 0; - order.clear(); - nextNotFoundPos = 0; - } - else - breakSearch = true; - break; - } - unmatchWordNom = 0; - break; - } - else - if( matchWordNom > 0 && n >= parsedWords.size() - 1 ) - { - unmatchWordNom += 1; - if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords ) - { - matchWordNom = 0; - unmatchWordNom = 0; - if( needHandleBrackets ) - order.clear(); - } - } - } - } - if( breakSearch ) - break; - if( nextNotFoundPos > 0 && matchWordNom == 0 ) - { - pos = nextNotFoundPos; - nextNotFoundPos = 0; - } - else - pos += 1; + } + } - - if( !allOrders.isEmpty() || matchWordNom >= words.size() ) + else { - QStringList hiliteReg; - if( !allOrders.isEmpty() ) + if( articleText.contains( searchRegularExpression ) ) { - for( int i = 0; i < allOrders.size(); i++ ) - { - QString hiliteStr = makeHiliteRegExpString( allOrders.at( i ), searchMode, distanceBetweenWords ); - hiliteReg.append( hiliteStr ); - } - allOrders.clear(); + if( headword.isEmpty() ) + offsetsForHeadwords.append( offsets.at( i ) ); + else + foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); + + results++; + if( maxResults > 0 && results >= maxResults ) + break; } - if( headword.isEmpty() ) - { - offsetsForHeadwords.append( offsets.at( i ) ); - hiliteRegExps.append( hiliteReg ); - } - else - foundHeadwords->append( FTS::FtsHeadword( headword, id, hiliteReg, matchCase ) ); - - results++; - if( maxResults > 0 && results >= maxResults ) - break; } } } @@ -1150,7 +1009,7 @@ void FTSResultsRequest::run() QRegExp searchRegExp; if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp, - searchMode, matchCase, distanceBetweenWords, hasCJK ) ) + searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) ) { finish(); return; diff --git a/ftshelpers.hh b/ftshelpers.hh index 6d0fa29a..f477016d 100644 --- a/ftshelpers.hh +++ b/ftshelpers.hh @@ -55,7 +55,8 @@ bool parseSearchString( QString const & str, QStringList & IndexWords, QRegExp & searchRegExp, int searchMode, bool matchCase, int distanceBetweenWords, - bool & hasCJK ); + bool & hasCJK, + bool ignoreWordsOrder = false ); void parseArticleForFts( uint32_t articleAddress, QString & articleText, QMap< QString, QVector< uint32_t > > & words, diff --git a/fulltextsearch.cc b/fulltextsearch.cc index 749a2a94..cb87124b 100644 --- a/fulltextsearch.cc +++ b/fulltextsearch.cc @@ -351,7 +351,7 @@ void FullTextSearchDialog::accept() searchRegExp, mode, ui.matchCase->isChecked(), distanceBetweenWords, - hasCJK ) ) + hasCJK, ignoreWordsOrder ) ) { // if( hasCJK && ( mode == WholeWords || mode == PlainText ) ) // { diff --git a/utils.hh b/utils.hh index bfd71a79..be5c2563 100644 --- a/utils.hh +++ b/utils.hh @@ -118,6 +118,16 @@ inline QString json2String( const QJsonObject & json ) return QString( QJsonDocument( json ).toJson( QJsonDocument::Compact ) ); } +inline QStringList repeat( const QString str, const int times ) +{ + QStringList list; + for( int i = 0; i < times; i++ ) + { + list << str; + } + return list; +} + namespace AtomicInt {