Merge branch 'staged' into dev

This commit is contained in:
Xiao YiFang 2022-06-13 19:48:06 +08:00
commit e2d470d6dc
11 changed files with 246 additions and 184 deletions

View file

@ -1,5 +1,7 @@
name: macos-PR-check name: macos-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on: on:
workflow_dispatch: workflow_dispatch:

View file

@ -1,5 +1,7 @@
name: Ubuntu-PR-check name: Ubuntu-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on: on:
workflow_dispatch: workflow_dispatch:

View file

@ -1,5 +1,7 @@
name: Windows-PR-check name: Windows-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on: on:
workflow_dispatch: workflow_dispatch:

View file

@ -24,24 +24,19 @@ a:hover
background: white; background: white;
} }
/* Dictionary's name heading */
.gddictname
{
border: 1px dotted black; padding: 0.2em; padding-left: 0.5em;
margin-top: 1.2em; margin-bottom: 0.1em; font-weight: bold; font-size: 14px;
background: #87CEEB;
}
/* The 'From ' string which precedes dictionary name in the heading */ /* The 'From ' string which precedes dictionary name in the heading */
.gdfromprefix .gdfromprefix
{ {
display: none; display: none;
} }
/* Dictionary's name heading */
.gddictname .gddictname
{ {
padding: 0.2em; padding-left: 0.5em;
margin-bottom: 0.1em;
font-size: 14px;
font-weight: normal; font-weight: normal;
float: right; float: right;
border: 1px solid white; border: 1px solid white;
margin-top: 7px; margin-top: 7px;

View file

@ -42,6 +42,11 @@ pre
/*background: #ffffdd;*/ /*background: #ffffdd;*/
} }
/* Elements with class gddicttitle: text inside them cannot be selected by the user */
.gddicttitle
{
user-select: none;
}
.gddictnamebodyseparator .gddictnamebodyseparator
{ {
clear: both; clear: both;

50
base/globalregex.cc Normal file
View file

@ -0,0 +1,50 @@
#include "globalregex.hh"
#include "fulltextsearch.hh"
// Out-of-line definitions of the static regular expressions declared in
// globalregex.hh. Each object is constructed once, during static
// initialisation of this translation unit, in the order written below.
namespace RX
{

namespace
{
// Option sets shared by the definitions below.
const QRegularExpression::PatternOptions kUnicodeProps    = QRegularExpression::UseUnicodePropertiesOption;
const QRegularExpression::PatternOptions kCaseInsensitive = QRegularExpression::CaseInsensitiveOption;
} // namespace

// ---------------------------------------------------------------------------
// Full-text search patterns
// ---------------------------------------------------------------------------

// A word part with optional "(...)" groups before it, after it, and after an
// optional second word part.
QRegularExpression Ftx::regBrackets(
  "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
  kUnicodeProps );

// One or more characters that are neither word characters nor marks (\p{M}).
QRegularExpression Ftx::regSplit( "[^\\w\\p{M}]+", kUnicodeProps );

// One or more non-word characters.
QRegularExpression Ftx::spacesRegExp( "\\W+", kUnicodeProps );

// A run of at least FTS::MinimumWordSize word characters.
QRegularExpression Ftx::wordRegExp(
  QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}",
  kUnicodeProps );

// A bracketed, non-empty character set: "[...]".
QRegularExpression Ftx::setsRegExp( "\\[[^\\]]+\\]", kCaseInsensitive );

// Backslash escapes: single-letter classes, \x + 4 hex digits, \0 + 3 octal digits.
QRegularExpression Ftx::regexRegExp(
  "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})",
  kCaseInsensitive );

// ---------------------------------------------------------------------------
// MDX article patterns
// ---------------------------------------------------------------------------

// Opening <a>, <area>, <img>, <link>, <script> or <source> tag; group 1 is the tag name.
QRegularExpression Mdx::allLinksRe(
  "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
  kCaseInsensitive );

// Quoted href pointing at "entry://"; group 3 is the target, group 4 the optional "#fragment".
QRegularExpression Mdx::wordCrossLink(
  "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
  kCaseInsensitive );

// name=/id= attribute up to and including its opening quote (and any blanks after it).
QRegularExpression Mdx::anchorIdRe(
  "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)",
  kCaseInsensitive );

// Same prefix as anchorIdRe, additionally capturing (group 3) everything up to the next '"'.
QRegularExpression Mdx::anchorIdReWord(
  "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)",
  kCaseInsensitive );

// name=/id= attribute whose value is not quoted; group 2 is the value.
QRegularExpression Mdx::anchorIdRe2(
  "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)",
  kCaseInsensitive );

// Quoted href that starts with "entry://#".
QRegularExpression Mdx::anchorLinkRe(
  "([\\s\"']href\\s*=\\s*[\"'])entry://#",
  kCaseInsensitive );

// Quoted href pointing at "sound://"; greediness of the quantifiers is inverted.
QRegularExpression Mdx::audioRe(
  "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
  kCaseInsensitive | QRegularExpression::InvertedGreedinessOption );

// Quoted href whose value does not start with bres://, http(s)://, ftp://, data: or javascript:.
QRegularExpression Mdx::stylesRe(
  "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
  "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
  kCaseInsensitive );

// Unquoted counterpart of stylesRe.
QRegularExpression Mdx::stylesRe2(
  "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
  "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
  kCaseInsensitive );

// Opening <script> tag that carries no src= attribute.
QRegularExpression Mdx::inlineScriptRe(
  "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
  kCaseInsensitive );

// Closing </script> tag.
QRegularExpression Mdx::closeScriptTagRe( "<\\s*/script\\s*>", kCaseInsensitive );

// Quoted src whose value does not start with bres://, http(s)://, ftp://, data: or javascript:.
QRegularExpression Mdx::srcRe(
  "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
  "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
  kCaseInsensitive );

// Unquoted counterpart of srcRe.
QRegularExpression Mdx::srcRe2(
  "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
  "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
  kCaseInsensitive );

} // namespace RX

40
base/globalregex.hh Normal file
View file

@ -0,0 +1,40 @@
#ifndef GLOBALREGEX_HH
#define GLOBALREGEX_HH
#include <QRegularExpression>
// Program-wide, pre-built regular expressions. The objects are defined in
// globalregex.cc so that callers can reuse one instance instead of
// constructing a new QRegularExpression on every call.
// NOTE(review): the members are mutable (non-const) statics; confirm that
// sharing them between threads is acceptable for every caller.
namespace RX
{
// Patterns used by the full-text search code.
class Ftx
{
public:
// Word part with optional "(...)" groups around/inside it.
static QRegularExpression regBrackets;
// Run of characters that are neither word characters nor marks (\p{M}).
static QRegularExpression regSplit;
// Run of non-word characters (\W+).
static QRegularExpression spacesRegExp;
// Run of at least FTS::MinimumWordSize word characters.
static QRegularExpression wordRegExp;
// Bracketed, non-empty character set: "[...]".
static QRegularExpression setsRegExp;
// Backslash escapes: single-letter classes, \x + 4 hex digits, \0 + 3 octal digits.
static QRegularExpression regexRegExp;
};
// Patterns used when rewriting the HTML of MDX articles.
class Mdx
{
public:
// Opening <a>, <area>, <img>, <link>, <script> or <source> tag.
static QRegularExpression allLinksRe;
// Quoted href with an "entry://" target and optional "#fragment".
static QRegularExpression wordCrossLink;
// name=/id= attribute up to and including its opening quote.
static QRegularExpression anchorIdRe;
// As anchorIdRe, additionally capturing the text up to the next '"'.
static QRegularExpression anchorIdReWord;
// name=/id= attribute with an unquoted value.
static QRegularExpression anchorIdRe2;
// Quoted href that starts with "entry://#".
static QRegularExpression anchorLinkRe;
// Quoted href with a "sound://" target.
static QRegularExpression audioRe;
// Quoted href not starting with bres://, http(s)://, ftp://, data: or javascript:.
static QRegularExpression stylesRe;
// Unquoted counterpart of stylesRe.
static QRegularExpression stylesRe2;
// Opening <script> tag without a src= attribute.
static QRegularExpression inlineScriptRe;
// Closing </script> tag.
static QRegularExpression closeScriptTagRe;
// Quoted src not starting with bres://, http(s)://, ftp://, data: or javascript:.
static QRegularExpression srcRe;
// Unquoted counterpart of srcRe.
static QRegularExpression srcRe2;
};
} // namespace RX
#endif // GLOBALREGEX_HH

View file

@ -17,6 +17,8 @@
#include <QRegularExpression> #include <QRegularExpression>
#include "wildcard.hh" #include "wildcard.hh"
#include <QtConcurrent>
#include "base/globalregex.hh"
using std::vector; using std::vector;
using std::string; using std::string;
@ -147,36 +149,36 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
{ {
searchWords.clear(); searchWords.clear();
indexWords.clear(); indexWords.clear();
QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption ); // QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption ); // QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption ); // QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption); // QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
hasCJK = containCJK( str ); hasCJK = containCJK( str );
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText ) if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
{ {
// Make words list for search in article text // Make words list for search in article text
searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts ); searchWords = str.normalized( QString::NormalizationForm_C ).split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
// Make words list for index search // Make words list for index search
QStringList list = QStringList list =
str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts ); str.normalized( QString::NormalizationForm_C ).toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
QString searchString; QString searchString;
if( hasCJK ) if( hasCJK )
{ {
tokenizeCJK( indexWords, wordRegExp, list ); tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); // QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder); searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
} }
else else
{ {
indexWords = list.filter( wordRegExp ); indexWords = list.filter( RX::Ftx::wordRegExp );
indexWords.removeDuplicates(); indexWords.removeDuplicates();
// Make regexp for results hilite // Make regexp for results hilite
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); QStringList allWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder ); searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
} }
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 ); searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
@ -191,21 +193,21 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
// Remove RegExp commands // Remove RegExp commands
if( searchMode == FTS::RegExp ) if( searchMode == FTS::RegExp )
tmp.replace( regexRegExp, " " ); tmp.replace( RX::Ftx::regexRegExp, " " );
// Remove all symbol sets // Remove all symbol sets
tmp.replace( setsRegExp, " " ); tmp.replace( RX::Ftx::setsRegExp, " " );
QStringList list = tmp.normalized( QString::NormalizationForm_C ) QStringList list = tmp.normalized( QString::NormalizationForm_C )
.toLower().split( spacesRegExp, Qt::SkipEmptyParts ); .toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
if( hasCJK ) if( hasCJK )
{ {
tokenizeCJK( indexWords, wordRegExp, list ); tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
} }
else else
{ {
indexWords = list.filter( wordRegExp ); indexWords = list.filter( RX::Ftx::wordRegExp );
indexWords.removeDuplicates(); indexWords.removeDuplicates();
} }
@ -224,9 +226,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
if( articleText.isEmpty() ) if( articleText.isEmpty() )
return; return;
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}", // QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
QRegularExpression::UseUnicodePropertiesOption); // QRegularExpression::UseUnicodePropertiesOption);
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); // QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
QStringList articleWords = articleText.normalized( QString::NormalizationForm_C ) QStringList articleWords = articleText.normalized( QString::NormalizationForm_C )
.split( QRegularExpression( handleRoundBrackets ? "[^\\w\\(\\)\\p{M}]+" : "[^\\w\\p{M}]+", .split( QRegularExpression( handleRoundBrackets ? "[^\\w\\(\\)\\p{M}]+" : "[^\\w\\p{M}]+",
@ -275,12 +277,12 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
// Special handle for words with round brackets - DSL feature // Special handle for words with round brackets - DSL feature
QStringList list; QStringList list;
QStringList oldVariant = word.split( regSplit, Qt::SkipEmptyParts ); QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it ) for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it )
if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) ) if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) )
list.append( *it ); list.append( *it );
QRegularExpressionMatch match = regBrackets.match( word ); QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
QStringList parts = match.capturedTexts(); QStringList parts = match.capturedTexts();
@ -445,21 +447,20 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
QStringList const & words, QStringList const & words,
QRegExp const & searchRegexp ) QRegExp const & searchRegexp )
{ {
int results = 0; QtConcurrent::blockingMap( offsets, [ & ]( uint32_t offset ) { checkSingleArticle( offset, words, searchRegexp ); } );
}
void FTSResultsRequest::checkSingleArticle( uint32_t offset,
QStringList const & words,
QRegExp const & searchRegexp )
{
qDebug()<<"checking"<<offset<<QThread::currentThreadId();
// int results = 0;
QString headword, articleText; QString headword, articleText;
QList< uint32_t > offsetsForHeadwords; QList< uint32_t > offsetsForHeadwords;
QVector< QStringList > hiliteRegExps; QVector< QStringList > hiliteRegExps;
QString id = QString::fromUtf8( dict.getId().c_str() ); QString id = QString::fromUtf8( dict.getId().c_str() );
bool needHandleBrackets;
{
QString name = QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ).toLower();
needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( ".dsl.dz" );
}
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
QRegularExpression::UseUnicodePropertiesOption);
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
// RegExp mode // RegExp mode
QRegularExpression searchRegularExpression; QRegularExpression searchRegularExpression;
@ -478,12 +479,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp ) if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
{ {
for( int i = 0; i < offsets.size(); i++ ) // for( int i = 0; i < offsets.size(); i++ )
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break; return;
dict.getArticleText( offsets.at( i ), headword, articleText ); // auto article_address = offsets.at( i );
dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C ); articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics ) if( ignoreDiacritics )
@ -492,13 +494,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
if( articleText.contains( searchRegularExpression ) ) if( articleText.contains( searchRegularExpression ) )
{ {
if( headword.isEmpty() ) if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) ); offsetsForHeadwords.append( offset );
else else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++; ++results;
if( maxResults > 0 && results >= maxResults ) if( maxResults > 0 && results >= maxResults )
break; return;
} }
} }
} }
@ -506,10 +508,6 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
{ {
// Words mode // Words mode
QRegularExpression splitWithBrackets( "[^\\w\\(\\)\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression splitWithoutBrackets( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
Qt::CaseSensitivity cs = matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive;
QVector< QPair< QString, bool > > wordsList; QVector< QPair< QString, bool > > wordsList;
if( ignoreWordsOrder ) if( ignoreWordsOrder )
{ {
@ -517,18 +515,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
wordsList.append( QPair< QString, bool >( *it, true ) ); wordsList.append( QPair< QString, bool >( *it, true ) );
} }
for( int i = 0; i < offsets.size(); i++ ) // for( int i = 0; i < offsets.size(); i++ )
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break; return;
int pos = 0;
int matchWordNom = 0;
int unmatchWordNom = 0;
int nextNotFoundPos = 0;
QVector< QStringList > allOrders;
QStringList order;
if( ignoreWordsOrder ) if( ignoreWordsOrder )
{ {
@ -536,17 +526,14 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
wordsList[ i ].second = true; wordsList[ i ].second = true;
} }
dict.getArticleText( offsets.at( i ), headword, articleText ); dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C ); articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics ) if( ignoreDiacritics )
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) ); articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
//QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets, if( ignoreWordsOrder )
// Qt::SkipEmptyParts );
if(ignoreWordsOrder)
{ {
bool allMatch = true; bool allMatch = true;
foreach( QString word, words ) foreach( QString word, words )
@ -559,75 +546,78 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
break; break;
} }
} }
else if( searchMode == FTS::WholeWords) else if( searchMode == FTS::WholeWords )
{ {
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption ); QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),
if( !articleText.contains( tmpReg) ) QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
if( !articleText.contains( tmpReg ) )
{ {
allMatch = false; allMatch = false;
break; break;
} }
} }
} }
if(!allMatch) if( !allMatch )
{ {
continue; return;
} }
if( distanceBetweenWords >= 0 ) if( distanceBetweenWords >= 0 )
{ {
// the article text contains all the needed words. // the article text contains all the needed words.
// determine if distance restriction is meet // determine if distance restriction is meet
QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ), const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
QRegularExpression::CaseInsensitiveOption | QRegularExpression::CaseInsensitiveOption
QRegularExpression::UseUnicodePropertiesOption ); | QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article. // use a string that could not be presented in the article.
articleText = articleText.replace( replaceReg, "=@XXXXX@=" ); articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
auto hasCJK = false; auto hasCJK = false;
foreach(QString word,words) foreach( QString word, words )
{ {
if(containCJK( word )) if( containCJK( word ) )
{ {
hasCJK = true; hasCJK = true;
break; break;
} }
} }
//hascjk value ,perhaps should depend on each word // hascjk value ,perhaps should depend on each word
auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK ); const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ),
QRegularExpression distanceOrderReg( searchRegStr, searchMode,
QRegularExpression::CaseInsensitiveOption | distanceBetweenWords,
QRegularExpression::UseUnicodePropertiesOption ); hasCJK );
const QRegularExpression distanceOrderReg( searchRegStr,
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article. // use a string that could not be presented in the article.
if(articleText.contains(distanceOrderReg)) if( articleText.contains( distanceOrderReg ) )
{ {
if( headword.isEmpty() ) if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) ); offsetsForHeadwords.append( offset );
else else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++; ++results;
if( maxResults > 0 && results >= maxResults ) if( maxResults > 0 && results >= maxResults )
break; return;
} }
} }
} }
else else
{ {
if( articleText.contains( searchRegularExpression ) ) if( articleText.contains( searchRegularExpression ) )
{ {
if( headword.isEmpty() ) if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) ); offsetsForHeadwords.append( offset );
else else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++; ++results;
if( maxResults > 0 && results >= maxResults ) if( maxResults > 0 && results >= maxResults )
break; return;
} }
} }
} }
@ -637,7 +627,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
QVector< QString > headwords; QVector< QString > headwords;
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled ); dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
for( int x = 0; x < headwords.size(); x++ ) for( int x = 0; x < headwords.size(); x++ )
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(), matchCase ) ); foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ),
id,
x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(),
matchCase ) );
} }
} }
@ -648,27 +641,28 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
{ {
// Find articles which contains all requested words // Find articles which contains all requested words
vector< BtreeIndexing::WordArticleLink > links; QSet< uint32_t > setOfOffsets;
QSet< uint32_t > setOfOffsets, tmp;
uint32_t size;
if( indexWords.isEmpty() ) if( indexWords.isEmpty() )
return; return;
int n = indexWords.length(); QList< QSet< uint32_t > > addressLists;
for( int i = 0; i < n; i++ )
auto findLinks = [ & ]( const QString & word )
{ {
QSet< uint32_t > tmp;
uint32_t size;
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; addressLists<< tmp;
tmp.clear(); vector< BtreeIndexing::WordArticleLink > links =
ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics );
links = ftsIndex.findArticles( gd::toWString( indexWords.at( i ) ), ignoreDiacritics );
for( unsigned x = 0; x < links.size(); x++ ) for( unsigned x = 0; x < links.size(); x++ )
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; addressLists<< tmp;
vector< char > chunk; vector< char > chunk;
char * linksPtr; char * linksPtr;
@ -677,24 +671,31 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk ); linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );
} }
memcpy( &size, linksPtr, sizeof(uint32_t) ); memcpy( &size, linksPtr, sizeof( uint32_t ) );
linksPtr += sizeof(uint32_t); linksPtr += sizeof( uint32_t );
for( uint32_t y = 0; y < size; y++ ) for( uint32_t y = 0; y < size; y++ )
{ {
tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) ); tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) );
linksPtr += sizeof(uint32_t); linksPtr += sizeof( uint32_t );
} }
} }
links.clear(); links.clear();
if( i == 0 ) addressLists<< tmp;
setOfOffsets = tmp; };
// int n = indexWords.length();
QtConcurrent::blockingMap( indexWords, findLinks );
int i = 0;
for( auto & elem : addressLists )
{
if( i++ == 0 )
setOfOffsets = elem;
else else
setOfOffsets = setOfOffsets.intersect( tmp ); setOfOffsets = setOfOffsets.intersect( elem );
} }
tmp.clear();
if( setOfOffsets.isEmpty() ) if( setOfOffsets.isEmpty() )
return; return;
@ -757,17 +758,15 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
if( !hieroglyphsList.empty() ) if( !hieroglyphsList.empty() )
{ {
QSet< uint32_t > tmp; QList< QSet< uint32_t > > sets;
vector< BtreeIndexing::WordArticleLink > links; auto fn_wordLink = [ & ](const QString & word )
for( int i = 0; i < hieroglyphsList.size(); i++ )
{ {
links = ftsIndex.findArticles( gd::toWString( hieroglyphsList.at( i ) ) ); QSet< uint32_t > tmp;
vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) );
for( unsigned x = 0; x < links.size(); x++ ) for( unsigned x = 0; x < links.size(); x++ )
{ {
if( Utils::AtomicInt::loadAcquire( isCancelled ) ) if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return; sets<< tmp;
vector< char > chunk; vector< char > chunk;
char * linksPtr; char * linksPtr;
@ -786,11 +785,17 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
} }
links.clear(); links.clear();
sets<< tmp;
};
QtConcurrent::blockingMap( hieroglyphsList, fn_wordLink );
if( i == 0 ) int i = 0;
setOfOffsets = tmp; for( auto & elem : sets )
{
if( i++ == 0 )
setOfOffsets = elem;
else else
setOfOffsets = setOfOffsets.intersect( tmp ); setOfOffsets = setOfOffsets.intersect( elem );
} }
allWordsLinks[ wordNom ] = setOfOffsets; allWordsLinks[ wordNom ] = setOfOffsets;

View file

@ -82,12 +82,16 @@ class FTSResultsRequest : public Dictionary::DataRequest
QAtomicInt isCancelled; QAtomicInt isCancelled;
QAtomicInt results;
QList< FTS::FtsHeadword > * foundHeadwords; QList< FTS::FtsHeadword > * foundHeadwords;
void checkArticles( QVector< uint32_t > const & offsets, void checkArticles( QVector< uint32_t > const & offsets,
QStringList const & words, QStringList const & words,
QRegExp const & searchRegexp = QRegExp() ); QRegExp const & searchRegexp = QRegExp() );
void checkSingleArticle( uint32_t offset, QStringList const & words, QRegExp const & searchRegexp = QRegExp() );
void indexSearch( BtreeIndexing::BtreeIndex & ftsIndex, void indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
sptr< ChunkedStorage::Reader > chunks, sptr< ChunkedStorage::Reader > chunks,
QStringList & indexWords, QStringList & indexWords,
@ -127,6 +131,7 @@ public:
searchString = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( searchString_ ) ) ); searchString = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( searchString_ ) ) );
foundHeadwords = new QList< FTS::FtsHeadword >; foundHeadwords = new QList< FTS::FtsHeadword >;
results = 0;
QThreadPool::globalInstance()->start( [ this ]() { this->run(); }, -100 ); QThreadPool::globalInstance()->start( [ this ]() { this->run(); }, -100 );
} }

View file

@ -47,7 +47,8 @@ QT += core \
webenginewidgets\ webenginewidgets\
webchannel\ webchannel\
printsupport \ printsupport \
help help \
concurrent
greaterThan(QT_MAJOR_VERSION, 5): QT += webenginecore core5compat greaterThan(QT_MAJOR_VERSION, 5): QT += webenginecore core5compat
@ -242,6 +243,7 @@ HEADERS += folding.hh \
ankiconnector.h \ ankiconnector.h \
article_inspect.h \ article_inspect.h \
articlewebpage.h \ articlewebpage.h \
base/globalregex.hh \
globalbroadcaster.h \ globalbroadcaster.h \
iframeschemehandler.h \ iframeschemehandler.h \
inc_case_folding.hh \ inc_case_folding.hh \
@ -384,6 +386,7 @@ SOURCES += folding.cc \
ankiconnector.cpp \ ankiconnector.cpp \
article_inspect.cpp \ article_inspect.cpp \
articlewebpage.cpp \ articlewebpage.cpp \
base/globalregex.cc \
globalbroadcaster.cpp \ globalbroadcaster.cpp \
iframeschemehandler.cpp \ iframeschemehandler.cpp \
main.cc \ main.cc \

81
mdx.cc
View file

@ -42,6 +42,7 @@
#include "tiff.hh" #include "tiff.hh"
#include "utils.hh" #include "utils.hh"
#include "base/globalregex.hh"
namespace Mdx namespace Mdx
{ {
@ -192,51 +193,6 @@ public:
}; };
// Bundle of regular expressions used when processing MDX article HTML.
// All patterns are built once, in the constructor's initializer list.
struct MdxRegex
{
// Builds every pattern; all use CaseInsensitiveOption, and audioRe
// additionally uses InvertedGreedinessOption.
MdxRegex() :
allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
QRegularExpression::CaseInsensitiveOption ),
wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
QRegularExpression::CaseInsensitiveOption ),
anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", QRegularExpression::CaseInsensitiveOption ),
anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)", QRegularExpression::CaseInsensitiveOption ),
anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", QRegularExpression::CaseInsensitiveOption ),
anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", QRegularExpression::CaseInsensitiveOption ),
audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ),
stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption ),
stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption ),
inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
QRegularExpression::CaseInsensitiveOption ),
closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption ),
srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption ),
srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption )
{
}
// Opening <a>, <area>, <img>, <link>, <script> or <source> tag.
QRegularExpression allLinksRe;
// Quoted href with an "entry://" target and optional "#fragment".
QRegularExpression wordCrossLink;
// name=/id= attribute up to and including its opening quote.
QRegularExpression anchorIdRe;
// As anchorIdRe, additionally capturing the text up to the next '"'.
QRegularExpression anchorIdReWord;
// name=/id= attribute with an unquoted value.
QRegularExpression anchorIdRe2;
// Quoted href that starts with "entry://#".
QRegularExpression anchorLinkRe;
// Quoted href with a "sound://" target.
QRegularExpression audioRe;
// Quoted href not starting with bres://, http(s)://, ftp://, data: or javascript:.
QRegularExpression stylesRe;
// Unquoted counterpart of stylesRe.
QRegularExpression stylesRe2;
// Opening <script> tag without a src= attribute.
QRegularExpression inlineScriptRe;
// Closing </script> tag.
QRegularExpression closeScriptTagRe;
// Quoted src not starting with bres://, http(s)://, ftp://, data: or javascript:.
QRegularExpression srcRe;
// Unquoted counterpart of srcRe.
QRegularExpression srcRe2;
};
class MdxDictionary: public BtreeIndexing::BtreeDictionary class MdxDictionary: public BtreeIndexing::BtreeDictionary
{ {
Mutex idxMutex; Mutex idxMutex;
@ -256,8 +212,6 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
string initError; string initError;
QString cacheDirName; QString cacheDirName;
static MdxRegex mdxRx;
public: public:
MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles ); MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );
@ -347,8 +301,6 @@ private:
friend class MddResourceRequest; friend class MddResourceRequest;
}; };
MdxRegex MdxDictionary::mdxRx;
MdxDictionary::MdxDictionary( string const & id, string const & indexFile, MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
vector<string> const & dictionaryFiles ): vector<string> const & dictionaryFiles ):
BtreeDictionary( id, dictionaryFiles ), BtreeDictionary( id, dictionaryFiles ),
@ -972,10 +924,11 @@ void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noF
decompressed.constData() + recordInfo.recordOffset, decompressed.constData() + recordInfo.recordOffset,
recordInfo.recordSize ); recordInfo.recordSize );
article = MdictParser::substituteStylesheet( article, styleSheets );
if( !noFilter ) if( !noFilter )
{
article = MdictParser::substituteStylesheet( article, styleSheets );
article = filterResource( articleId, article ); article = filterResource( articleId, article );
}
articleText = article.toStdString(); articleText = article.toStdString();
} }
@ -987,7 +940,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
QString articleNewText; QString articleNewText;
int linkPos = 0; int linkPos = 0;
QRegularExpressionMatchIterator it = mdxRx.allLinksRe.globalMatch( article ); QRegularExpressionMatchIterator it = RX::Mdx::allLinksRe.globalMatch( article );
QMap<QString,QString> idMap; QMap<QString,QString> idMap;
while( it.hasNext() ) while( it.hasNext() )
{ {
@ -1005,10 +958,10 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' ) if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
{ {
QRegularExpressionMatch match = mdxRx.anchorIdRe.match( linkTxt ); QRegularExpressionMatch match = RX::Mdx::anchorIdRe.match( linkTxt );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
auto wordMatch = mdxRx.anchorIdReWord.match( linkTxt ); auto wordMatch = RX::Mdx::anchorIdReWord.match( linkTxt );
if( wordMatch.hasMatch() ) if( wordMatch.hasMatch() )
{ {
idMap.insert( wordMatch.captured( 3 ), uniquePrefix + wordMatch.captured( 3 ) ); idMap.insert( wordMatch.captured( 3 ), uniquePrefix + wordMatch.captured( 3 ) );
@ -1017,11 +970,11 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
} }
else else
newLink = linkTxt.replace( mdxRx.anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" ); newLink = linkTxt.replace( RX::Mdx::anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
newLink = newLink.replace( mdxRx.anchorLinkRe, "\\1#" + uniquePrefix ); newLink = newLink.replace( RX::Mdx::anchorLinkRe, "\\1#" + uniquePrefix );
match = mdxRx.audioRe.match( newLink ); match = RX::Mdx::audioRe.match( newLink );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
// sounds and audio link script // sounds and audio link script
@ -1032,7 +985,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
+ newLink.replace( match.capturedStart(), match.capturedLength(), newTxt ); + newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
} }
match = mdxRx.wordCrossLink.match( newLink ); match = RX::Mdx::wordCrossLink.match( newLink );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
QString newTxt = match.captured( 1 ) + match.captured( 2 ) QString newTxt = match.captured( 1 ) + match.captured( 2 )
@ -1050,7 +1003,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
if( linkType.compare( "link" ) == 0 ) if( linkType.compare( "link" ) == 0 )
{ {
// stylesheets // stylesheets
QRegularExpressionMatch match = mdxRx.stylesRe.match( linkTxt ); QRegularExpressionMatch match = RX::Mdx::stylesRe.match( linkTxt );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
QString newText = match.captured( 1 ) + match.captured( 2 ) QString newText = match.captured( 1 ) + match.captured( 2 )
@ -1059,7 +1012,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
} }
else else
newLink = linkTxt.replace( mdxRx.stylesRe2, newLink = linkTxt.replace( RX::Mdx::stylesRe2,
"\\1\"bres://" + id + "/\\2\"" ); "\\1\"bres://" + id + "/\\2\"" );
} }
else else
@ -1067,13 +1020,13 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|| linkType.compare( "source" ) == 0 ) || linkType.compare( "source" ) == 0 )
{ {
// javascripts and images // javascripts and images
QRegularExpressionMatch match = mdxRx.inlineScriptRe.match( linkTxt ); QRegularExpressionMatch match = RX::Mdx::inlineScriptRe.match( linkTxt );
if( linkType.at( 1 ) == 'c' // "script" tag if( linkType.at( 1 ) == 'c' // "script" tag
&& match.hasMatch() && match.capturedLength() == linkTxt.length() ) && match.hasMatch() && match.capturedLength() == linkTxt.length() )
{ {
// skip inline scripts // skip inline scripts
articleNewText += linkTxt; articleNewText += linkTxt;
match = mdxRx.closeScriptTagRe.match( article, linkPos ); match = RX::Mdx::closeScriptTagRe.match( article, linkPos );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
articleNewText += article.mid( linkPos, match.capturedEnd() - linkPos ); articleNewText += article.mid( linkPos, match.capturedEnd() - linkPos );
@ -1083,7 +1036,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
} }
else else
{ {
match = mdxRx.srcRe.match( linkTxt ); match = RX::Mdx::srcRe.match( linkTxt );
if( match.hasMatch() ) if( match.hasMatch() )
{ {
QString newText; QString newText;
@ -1104,7 +1057,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
} }
else else
newLink = linkTxt.replace( mdxRx.srcRe2, newLink = linkTxt.replace( RX::Mdx::srcRe2,
"\\1\"bres://" + id + "/\\2\"" ); "\\1\"bres://" + id + "/\\2\"" );
} }
} }