mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 04:24:09 +00:00
Merge pull request #90 from xiaoyifang/feature/cjk
improve: fulltext search checkArticle optimize
This commit is contained in:
commit
c42513ebe8
355
ftshelpers.cc
355
ftshelpers.cc
|
@ -15,6 +15,7 @@
|
|||
#include <QVector>
|
||||
|
||||
#include <QRegularExpression>
|
||||
|
||||
#include "wildcard.hh"
|
||||
|
||||
using std::vector;
|
||||
|
@ -38,8 +39,7 @@ bool ftsIndexIsOldOrBad( string const & indexFile,
|
|||
}
|
||||
|
||||
static QString makeHiliteRegExpString( QStringList const & words,
|
||||
int searchMode,
|
||||
int distanceBetweenWords, bool hasCJK = false )
|
||||
int searchMode, int distanceBetweenWords, bool hasCJK = false, bool ignoreWordsOrder = false )
|
||||
{
|
||||
QString searchString( "(" );
|
||||
|
||||
|
@ -63,9 +63,20 @@ static QString makeHiliteRegExpString( QStringList const & words,
|
|||
for( int x = 0; x < words.size(); x++ )
|
||||
{
|
||||
if( x )
|
||||
{
|
||||
searchString += stripWords;
|
||||
if(ignoreWordsOrder)
|
||||
searchString += "(";
|
||||
}
|
||||
|
||||
searchString += boundWord + words[ x ] + boundWord;
|
||||
|
||||
if( x )
|
||||
{
|
||||
if( ignoreWordsOrder )
|
||||
searchString += ")?";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
searchString += ")";
|
||||
|
@ -125,7 +136,8 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
|||
QRegExp & searchRegExp, int searchMode,
|
||||
bool matchCase,
|
||||
int distanceBetweenWords,
|
||||
bool & hasCJK )
|
||||
bool & hasCJK,
|
||||
bool ignoreWordsOrder )
|
||||
{
|
||||
searchWords.clear();
|
||||
indexWords.clear();
|
||||
|
@ -149,7 +161,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
|||
{
|
||||
tokenizeCJK( indexWords, wordRegExp, list );
|
||||
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
searchString = makeHiliteRegExpString( indexWords, searchMode, distanceBetweenWords, hasCJK );
|
||||
searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -159,7 +171,7 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
|||
// Make regexp for results hilite
|
||||
|
||||
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords );
|
||||
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
|
||||
}
|
||||
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
|
||||
searchRegExp.setMinimal( true );
|
||||
|
@ -443,25 +455,23 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
QRegularExpression::UseUnicodePropertiesOption);
|
||||
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
|
||||
// RegExp mode
|
||||
QRegularExpression searchRegularExpression;
|
||||
if( searchMode == FTS::Wildcards )
|
||||
searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
|
||||
else
|
||||
searchRegularExpression.setPattern( searchRegexp.pattern() );
|
||||
QRegularExpression::PatternOptions patternOptions =
|
||||
QRegularExpression::DotMatchesEverythingOption | QRegularExpression::UseUnicodePropertiesOption
|
||||
| QRegularExpression::MultilineOption | QRegularExpression::InvertedGreedinessOption;
|
||||
if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
|
||||
patternOptions |= QRegularExpression::CaseInsensitiveOption;
|
||||
searchRegularExpression.setPatternOptions( patternOptions );
|
||||
if( !searchRegularExpression.isValid() )
|
||||
searchRegularExpression.setPattern( "" );
|
||||
|
||||
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
|
||||
{
|
||||
// RegExp mode
|
||||
|
||||
QRegularExpression searchRegularExpression;
|
||||
if( searchMode == FTS::Wildcards )
|
||||
searchRegularExpression.setPattern( wildcardsToRegexp( searchRegexp.pattern() ) );
|
||||
else
|
||||
searchRegularExpression.setPattern( searchRegexp.pattern() );
|
||||
QRegularExpression::PatternOptions patternOptions = QRegularExpression::DotMatchesEverythingOption
|
||||
| QRegularExpression::UseUnicodePropertiesOption
|
||||
| QRegularExpression::MultilineOption
|
||||
| QRegularExpression::InvertedGreedinessOption;
|
||||
if( searchRegexp.caseSensitivity() == Qt::CaseInsensitive )
|
||||
patternOptions |= QRegularExpression::CaseInsensitiveOption;
|
||||
searchRegularExpression.setPatternOptions( patternOptions );
|
||||
if( !searchRegularExpression.isValid() )
|
||||
searchRegularExpression.setPattern( "" );
|
||||
for( int i = 0; i < offsets.size(); i++ )
|
||||
{
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
|
@ -527,243 +537,92 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
if( ignoreDiacritics )
|
||||
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
|
||||
|
||||
QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
|
||||
Qt::SkipEmptyParts );
|
||||
//QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
|
||||
// Qt::SkipEmptyParts );
|
||||
|
||||
int wordsNum = articleWords.length();
|
||||
while ( pos < wordsNum )
|
||||
if(ignoreWordsOrder)
|
||||
{
|
||||
QString s = articleWords[ pos ];
|
||||
bool breakSearch = false;
|
||||
|
||||
QStringList parsedWords;
|
||||
if( needHandleBrackets && ( s.indexOf( '(' ) >= 0 || s.indexOf( ')' ) >= 0 ) )
|
||||
bool allMatch = true;
|
||||
foreach( QString word, words )
|
||||
{
|
||||
if( containCJK( word ) || searchMode == FTS::PlainText )
|
||||
{
|
||||
// Handle brackets
|
||||
QRegularExpressionMatch match_brackets = regBrackets.match( s );
|
||||
if( match_brackets.hasMatch() )
|
||||
if( !articleText.contains( word ) )
|
||||
{
|
||||
QStringList parts = match_brackets.capturedTexts();
|
||||
// Add empty strings for compatibility with QRegExp behaviour
|
||||
for( int i = match_brackets.lastCapturedIndex() + 1; i < 6; i++ )
|
||||
parts.append( QString() );
|
||||
|
||||
QString word = parts[ 2 ] + parts[ 4 ]; // Brackets removed
|
||||
parsedWords.append( word );
|
||||
|
||||
word = parts[ 1 ].remove( '(' ).remove( ')' )
|
||||
+ parts[ 2 ]
|
||||
+ parts[ 3 ].remove( '(' ).remove( ')' )
|
||||
+ parts[ 4 ]
|
||||
+ parts[ 5 ].remove( '(' ).remove( ')' ); // Brackets expansed
|
||||
parsedWords.append( word );
|
||||
allMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if( searchMode == FTS::WholeWords)
|
||||
{
|
||||
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption );
|
||||
if( !articleText.contains( tmpReg) )
|
||||
{
|
||||
allMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(!allMatch)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if( distanceBetweenWords >= 0 )
|
||||
{
|
||||
// the article text contains all the needed words.
|
||||
// determine if distance restriction is meet
|
||||
QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
|
||||
QRegularExpression::CaseInsensitiveOption |
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
// use a string that could not be presented in the article.
|
||||
articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
|
||||
|
||||
auto hasCJK = false;
|
||||
foreach(QString word,words)
|
||||
{
|
||||
if(containCJK( word ))
|
||||
{
|
||||
hasCJK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//hascjk value ,perhaps should depend on each word
|
||||
auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK );
|
||||
QRegularExpression distanceOrderReg( searchRegStr,
|
||||
QRegularExpression::CaseInsensitiveOption |
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
// use a string that could not be presented in the article.
|
||||
if(articleText.contains(distanceOrderReg))
|
||||
{
|
||||
if( headword.isEmpty() )
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
else
|
||||
parsedWords = s.split( regSplit, Qt::SkipEmptyParts );
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
|
||||
|
||||
results++;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
}
|
||||
else
|
||||
parsedWords.append( s );
|
||||
|
||||
int n;
|
||||
for( n = 0; n < parsedWords.size(); n++ )
|
||||
{
|
||||
auto parsed_word = parsedWords.at( n );
|
||||
if( ignoreWordsOrder )
|
||||
{
|
||||
int i;
|
||||
for( i = 0; i < wordsList.size(); i++ )
|
||||
{
|
||||
if( wordsList.at( i ).second )
|
||||
{
|
||||
if( ( searchMode == FTS::WholeWords && parsed_word.compare( wordsList.at( i ).first, cs ) == 0 )
|
||||
|| ( searchMode == FTS::PlainText && parsed_word.contains( wordsList.at( i ).first, cs ) ) )
|
||||
{
|
||||
wordsList[ i ].second = false;
|
||||
|
||||
if( parsedWords.size() > 1 )
|
||||
{
|
||||
QString wordToHilite = s;
|
||||
while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) )
|
||||
wordToHilite.remove( 0, 1 );
|
||||
while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) )
|
||||
wordToHilite.chop( 1 );
|
||||
order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) );
|
||||
}
|
||||
else
|
||||
order.append( wordsList.at( i ).first );
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( i < wordsList.size() )
|
||||
{
|
||||
// Word found
|
||||
|
||||
matchWordNom += 1;
|
||||
|
||||
if( matchWordNom == 1 )
|
||||
{
|
||||
// Store position to remake search if sequence will not be found
|
||||
nextNotFoundPos = pos + 1;
|
||||
}
|
||||
|
||||
if( matchWordNom >= words.size() )
|
||||
{
|
||||
// All words are found
|
||||
// Store found words sequence and continue search
|
||||
// It's nesessary for hilite search results
|
||||
|
||||
// Check if such sequence already presented
|
||||
int x;
|
||||
for( x = 0; x < allOrders.size(); x++ )
|
||||
{
|
||||
if( allOrders[ x ] == order )
|
||||
break;
|
||||
}
|
||||
if( x >= allOrders.size() )
|
||||
allOrders.append( order );
|
||||
|
||||
order.clear();
|
||||
|
||||
matchWordNom = 0;
|
||||
unmatchWordNom = 0;
|
||||
for( int i = 0; i < wordsList.size(); i++ )
|
||||
wordsList[ i ].second = true;
|
||||
nextNotFoundPos = 0;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
unmatchWordNom = 0;
|
||||
break;
|
||||
}
|
||||
else
|
||||
if( matchWordNom > 0 && n >= parsedWords.size() - 1 )
|
||||
{
|
||||
unmatchWordNom += 1;
|
||||
if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords )
|
||||
{
|
||||
// Sequence broken, clear all counters
|
||||
matchWordNom = 0;
|
||||
unmatchWordNom = 0;
|
||||
for( int i = 0; i < wordsList.size(); i++ )
|
||||
wordsList[ i ].second = true;
|
||||
order.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//for cjk word, FTS::WholeWords and FTS::PlainText actually have same effect.
|
||||
auto match_word = words.at( matchWordNom );
|
||||
bool hasCJK = containCJK( match_word );
|
||||
if( ( searchMode == FTS::WholeWords &&
|
||||
( ( !hasCJK&& parsed_word.compare( match_word, cs ) == 0 ) ||
|
||||
( hasCJK && parsed_word.contains( match_word, cs ) ) ) )
|
||||
|| ( searchMode == FTS::PlainText && parsed_word.contains( match_word, cs ) ) )
|
||||
{
|
||||
matchWordNom += 1;
|
||||
|
||||
if( matchWordNom == 1 )
|
||||
{
|
||||
// Store position to remake search if sequence will not be found
|
||||
nextNotFoundPos = pos + 1;
|
||||
}
|
||||
|
||||
if( needHandleBrackets )
|
||||
{
|
||||
if( parsedWords.size() > 1 )
|
||||
{
|
||||
QString wordToHilite = s;
|
||||
while( !wordToHilite.isEmpty() && ( wordToHilite.at( 0 ) == '(' || wordToHilite.at( 0 ) == ')' ) )
|
||||
wordToHilite.remove( 0, 1 );
|
||||
while( !wordToHilite.isEmpty() && ( wordToHilite.endsWith( '(' ) || wordToHilite.endsWith( ')' ) ) )
|
||||
wordToHilite.chop( 1 );
|
||||
order.append( wordToHilite.replace( '(', "\\(" ).replace( ')', "\\)" ) );
|
||||
}
|
||||
else
|
||||
order.append( words.at( matchWordNom - 1 ) );
|
||||
}
|
||||
|
||||
if( matchWordNom >= words.size() )
|
||||
{
|
||||
// All words are found
|
||||
if( needHandleBrackets )
|
||||
{
|
||||
if( allOrders.isEmpty() )
|
||||
allOrders.append( words );
|
||||
|
||||
// Check if such sequence already presented
|
||||
int x;
|
||||
for( x = 0; x < allOrders.size(); x++ )
|
||||
{
|
||||
if( allOrders[ x ] == order )
|
||||
break;
|
||||
}
|
||||
if( x >= allOrders.size() )
|
||||
allOrders.append( order );
|
||||
|
||||
matchWordNom = 0;
|
||||
unmatchWordNom = 0;
|
||||
order.clear();
|
||||
nextNotFoundPos = 0;
|
||||
}
|
||||
else
|
||||
breakSearch = true;
|
||||
break;
|
||||
}
|
||||
unmatchWordNom = 0;
|
||||
break;
|
||||
}
|
||||
else
|
||||
if( matchWordNom > 0 && n >= parsedWords.size() - 1 )
|
||||
{
|
||||
unmatchWordNom += 1;
|
||||
if( distanceBetweenWords >= 0 && unmatchWordNom > distanceBetweenWords )
|
||||
{
|
||||
matchWordNom = 0;
|
||||
unmatchWordNom = 0;
|
||||
if( needHandleBrackets )
|
||||
order.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if( breakSearch )
|
||||
break;
|
||||
if( nextNotFoundPos > 0 && matchWordNom == 0 )
|
||||
{
|
||||
pos = nextNotFoundPos;
|
||||
nextNotFoundPos = 0;
|
||||
}
|
||||
else
|
||||
pos += 1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if( !allOrders.isEmpty() || matchWordNom >= words.size() )
|
||||
else
|
||||
{
|
||||
QStringList hiliteReg;
|
||||
if( !allOrders.isEmpty() )
|
||||
if( articleText.contains( searchRegularExpression ) )
|
||||
{
|
||||
for( int i = 0; i < allOrders.size(); i++ )
|
||||
{
|
||||
QString hiliteStr = makeHiliteRegExpString( allOrders.at( i ), searchMode, distanceBetweenWords );
|
||||
hiliteReg.append( hiliteStr );
|
||||
}
|
||||
allOrders.clear();
|
||||
if( headword.isEmpty() )
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
else
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
|
||||
|
||||
results++;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
}
|
||||
if( headword.isEmpty() )
|
||||
{
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
hiliteRegExps.append( hiliteReg );
|
||||
}
|
||||
else
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, hiliteReg, matchCase ) );
|
||||
|
||||
results++;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1150,7 +1009,7 @@ void FTSResultsRequest::run()
|
|||
QRegExp searchRegExp;
|
||||
|
||||
if( !FtsHelpers::parseSearchString( searchString, indexWords, searchWords, searchRegExp,
|
||||
searchMode, matchCase, distanceBetweenWords, hasCJK ) )
|
||||
searchMode, matchCase, distanceBetweenWords, hasCJK, ignoreWordsOrder ) )
|
||||
{
|
||||
finish();
|
||||
return;
|
||||
|
|
|
@ -55,7 +55,8 @@ bool parseSearchString( QString const & str, QStringList & IndexWords,
|
|||
QRegExp & searchRegExp, int searchMode,
|
||||
bool matchCase,
|
||||
int distanceBetweenWords,
|
||||
bool & hasCJK );
|
||||
bool & hasCJK,
|
||||
bool ignoreWordsOrder = false );
|
||||
|
||||
void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
||||
QMap< QString, QVector< uint32_t > > & words,
|
||||
|
|
|
@ -351,7 +351,7 @@ void FullTextSearchDialog::accept()
|
|||
searchRegExp, mode,
|
||||
ui.matchCase->isChecked(),
|
||||
distanceBetweenWords,
|
||||
hasCJK ) )
|
||||
hasCJK, ignoreWordsOrder ) )
|
||||
{
|
||||
// if( hasCJK && ( mode == WholeWords || mode == PlainText ) )
|
||||
// {
|
||||
|
|
10
utils.hh
10
utils.hh
|
@ -118,6 +118,16 @@ inline QString json2String( const QJsonObject & json )
|
|||
return QString( QJsonDocument( json ).toJson( QJsonDocument::Compact ) );
|
||||
}
|
||||
|
||||
inline QStringList repeat( const QString str, const int times )
|
||||
{
|
||||
QStringList list;
|
||||
for( int i = 0; i < times; i++ )
|
||||
{
|
||||
list << str;
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
namespace AtomicInt
|
||||
{
|
||||
|
||||
|
|
Loading…
Reference in a new issue