diff --git a/.github/workflows/macos-PR-check.yml b/.github/workflows/macos-PR-check.yml index def35f86..3fb902b7 100644 --- a/.github/workflows/macos-PR-check.yml +++ b/.github/workflows/macos-PR-check.yml @@ -1,5 +1,7 @@ name: macos-PR-check - +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true on: workflow_dispatch: diff --git a/.github/workflows/ubuntu-PR-check.yml b/.github/workflows/ubuntu-PR-check.yml index 5fad1eb0..781c5225 100644 --- a/.github/workflows/ubuntu-PR-check.yml +++ b/.github/workflows/ubuntu-PR-check.yml @@ -1,5 +1,7 @@ name: Ubuntu-PR-check - +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true on: workflow_dispatch: diff --git a/.github/workflows/windows-PR-check.yml b/.github/workflows/windows-PR-check.yml index ea7348bb..9a0724d6 100644 --- a/.github/workflows/windows-PR-check.yml +++ b/.github/workflows/windows-PR-check.yml @@ -1,5 +1,7 @@ name: Windows-PR-check - +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true on: workflow_dispatch: diff --git a/article-style-st-lingvo.css b/article-style-st-lingvo.css index 8da59179..20974124 100644 --- a/article-style-st-lingvo.css +++ b/article-style-st-lingvo.css @@ -24,24 +24,19 @@ a:hover background: white; } -/* Dictionary's name heading */ -.gddictname -{ - border: 1px dotted black; padding: 0.2em; padding-left: 0.5em; - margin-top: 1.2em; margin-bottom: 0.1em; font-weight: bold; font-size: 14px; - background: #87CEEB; -} - /* The 'From ' string which preceeds dictionary name in the heading */ .gdfromprefix { display: none; } +/* Dictionary's name heading */ .gddictname { + padding: 0.2em; padding-left: 0.5em; + margin-bottom: 0.1em; + font-size: 14px; font-weight: normal; - float: right; border: 1px solid white; margin-top: 7px; diff --git a/article-style.css b/article-style.css index 959edbaa..4f2f871e 100644 --- a/article-style.css +++ b/article-style.css @@ -42,6 +42,11 @@ pre /*background: #ffffdd;*/ } +.gddicttitle +{ + user-select: none; +} + .gddictnamebodyseparator { clear: both; diff --git a/base/globalregex.cc b/base/globalregex.cc new file mode 100644 index 00000000..fab12fe5 --- /dev/null +++ b/base/globalregex.cc @@ -0,0 +1,50 @@ +#include "globalregex.hh" +#include "fulltextsearch.hh" + +using namespace RX; + +QRegularExpression Ftx::regBrackets( + "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}", + QRegularExpression::UseUnicodePropertiesOption ); +QRegularExpression Ftx::regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); + +QRegularExpression Ftx::spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption ); +QRegularExpression Ftx::wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", + QRegularExpression::UseUnicodePropertiesOption ); +QRegularExpression Ftx::setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Ftx::regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", + QRegularExpression::CaseInsensitiveOption ); + + +//mdx + +QRegularExpression Mdx::allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2", + QRegularExpression::CaseInsensitiveOption + | QRegularExpression::InvertedGreedinessOption ); +QRegularExpression Mdx::stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://" + "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://" + "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://" + "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2", + QRegularExpression::CaseInsensitiveOption ); +QRegularExpression Mdx::srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://" + "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)", + QRegularExpression::CaseInsensitiveOption ); diff --git a/base/globalregex.hh b/base/globalregex.hh new file mode 100644 index 00000000..012aba0d --- /dev/null +++ b/base/globalregex.hh @@ -0,0 +1,40 @@ +#ifndef GLOBALREGEX_HH +#define GLOBALREGEX_HH + +#include + +namespace RX +{ +class Ftx +{ +public: + static QRegularExpression regBrackets; + static QRegularExpression regSplit; + static QRegularExpression spacesRegExp; + static QRegularExpression wordRegExp; + static QRegularExpression setsRegExp; + static QRegularExpression regexRegExp; +}; + + +class Mdx +{ +public: + static QRegularExpression allLinksRe; + static QRegularExpression wordCrossLink; + static QRegularExpression anchorIdRe; + static QRegularExpression anchorIdReWord; + static QRegularExpression anchorIdRe2; + static QRegularExpression anchorLinkRe; + static QRegularExpression audioRe; + static QRegularExpression stylesRe; + static QRegularExpression stylesRe2; + static QRegularExpression inlineScriptRe; + static QRegularExpression closeScriptTagRe; + static QRegularExpression srcRe; + static QRegularExpression srcRe2; +}; + +} // namespace RX + +#endif // GLOBALREGEX_HH diff --git a/ftshelpers.cc b/ftshelpers.cc index e155e06f..997c05c0 100644 --- a/ftshelpers.cc +++ b/ftshelpers.cc @@ -17,6 +17,8 @@ #include #include "wildcard.hh" +#include +#include "base/globalregex.hh" using std::vector; using std::string; @@ -147,36 +149,36 @@ bool parseSearchString( QString const & str, QStringList & indexWords, { searchWords.clear(); indexWords.clear(); - QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption ); - QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption ); - QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption ); - QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption); + // QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption ); + // QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption ); + // QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption ); + // QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption); hasCJK = containCJK( str ); if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText ) { // Make words list for search in article text - searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts ); + searchWords = str.normalized( QString::NormalizationForm_C ).split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts ); // Make words list for index search QStringList list = - str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts ); + str.normalized( QString::NormalizationForm_C ).toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts ); QString searchString; if( hasCJK ) { - tokenizeCJK( indexWords, wordRegExp, list ); + tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list ); // QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder); } else { - indexWords = list.filter( wordRegExp ); + indexWords = list.filter( RX::Ftx::wordRegExp ); indexWords.removeDuplicates(); // Make regexp for results hilite - QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts ); + QStringList allWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts ); searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder ); } searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 ); @@ -191,21 +193,21 @@ bool parseSearchString( QString const & str, QStringList & indexWords, // Remove RegExp commands if( searchMode == FTS::RegExp ) - tmp.replace( regexRegExp, " " ); + tmp.replace( RX::Ftx::regexRegExp, " " ); // Remove all symbol sets - tmp.replace( setsRegExp, " " ); + tmp.replace( RX::Ftx::setsRegExp, " " ); QStringList list = tmp.normalized( QString::NormalizationForm_C ) - .toLower().split( spacesRegExp, Qt::SkipEmptyParts ); + .toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts ); if( hasCJK ) { - tokenizeCJK( indexWords, wordRegExp, list ); + tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list ); } else { - indexWords = list.filter( wordRegExp ); + indexWords = list.filter( RX::Ftx::wordRegExp ); indexWords.removeDuplicates(); } @@ -224,9 +226,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText, if( articleText.isEmpty() ) return; - QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}", - QRegularExpression::UseUnicodePropertiesOption); - QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); + // QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}", + // QRegularExpression::UseUnicodePropertiesOption); + // QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); QStringList articleWords = articleText.normalized( QString::NormalizationForm_C ) .split( QRegularExpression( handleRoundBrackets ? "[^\\w\\(\\)\\p{M}]+" : "[^\\w\\p{M}]+", @@ -275,12 +277,12 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText, // Special handle for words with round brackets - DSL feature QStringList list; - QStringList oldVariant = word.split( regSplit, Qt::SkipEmptyParts ); + QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts ); for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it ) if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) ) list.append( *it ); - QRegularExpressionMatch match = regBrackets.match( word ); + QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word ); if( match.hasMatch() ) { QStringList parts = match.capturedTexts(); @@ -445,21 +447,20 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, QStringList const & words, QRegExp const & searchRegexp ) { - int results = 0; + QtConcurrent::blockingMap( offsets, [ & ]( uint32_t offset ) { checkSingleArticle( offset, words, searchRegexp ); } ); +} + +void FTSResultsRequest::checkSingleArticle( uint32_t offset, + QStringList const & words, + QRegExp const & searchRegexp ) +{ + qDebug()<<"checking"< offsetsForHeadwords; QVector< QStringList > hiliteRegExps; QString id = QString::fromUtf8( dict.getId().c_str() ); - bool needHandleBrackets; - { - QString name = QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ).toLower(); - needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( ".dsl.dz" ); - } - - QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}", - QRegularExpression::UseUnicodePropertiesOption); - QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); // RegExp mode QRegularExpression searchRegularExpression; @@ -478,12 +479,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp ) { - for( int i = 0; i < offsets.size(); i++ ) + // for( int i = 0; i < offsets.size(); i++ ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) - break; + return; - dict.getArticleText( offsets.at( i ), headword, articleText ); + // auto article_address = offsets.at( i ); + dict.getArticleText( offset, headword, articleText ); articleText = articleText.normalized( QString::NormalizationForm_C ); if( ignoreDiacritics ) @@ -492,13 +494,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, if( articleText.contains( searchRegularExpression ) ) { if( headword.isEmpty() ) - offsetsForHeadwords.append( offsets.at( i ) ); + offsetsForHeadwords.append( offset ); else foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); - results++; + ++results; if( maxResults > 0 && results >= maxResults ) - break; + return; } } } @@ -506,10 +508,6 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, { // Words mode - QRegularExpression splitWithBrackets( "[^\\w\\(\\)\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); - QRegularExpression splitWithoutBrackets( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption ); - - Qt::CaseSensitivity cs = matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive; QVector< QPair< QString, bool > > wordsList; if( ignoreWordsOrder ) { @@ -517,18 +515,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, wordsList.append( QPair< QString, bool >( *it, true ) ); } - for( int i = 0; i < offsets.size(); i++ ) + // for( int i = 0; i < offsets.size(); i++ ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) - break; - - int pos = 0; - int matchWordNom = 0; - int unmatchWordNom = 0; - int nextNotFoundPos = 0; - - QVector< QStringList > allOrders; - QStringList order; + return; if( ignoreWordsOrder ) { @@ -536,17 +526,14 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, wordsList[ i ].second = true; } - dict.getArticleText( offsets.at( i ), headword, articleText ); + dict.getArticleText( offset, headword, articleText ); articleText = articleText.normalized( QString::NormalizationForm_C ); if( ignoreDiacritics ) articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) ); - //QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets, - // Qt::SkipEmptyParts ); - - if(ignoreWordsOrder) + if( ignoreWordsOrder ) { bool allMatch = true; foreach( QString word, words ) @@ -559,75 +546,78 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, break; } } - else if( searchMode == FTS::WholeWords) + else if( searchMode == FTS::WholeWords ) { - QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption ); - if( !articleText.contains( tmpReg) ) + QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ), + QRegularExpression::CaseInsensitiveOption + | QRegularExpression::UseUnicodePropertiesOption ); + if( !articleText.contains( tmpReg ) ) { allMatch = false; break; } } - } - if(!allMatch) + if( !allMatch ) { - continue; + return; } if( distanceBetweenWords >= 0 ) { // the article text contains all the needed words. // determine if distance restriction is meet - QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ), - QRegularExpression::CaseInsensitiveOption | - QRegularExpression::UseUnicodePropertiesOption ); + const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ), + QRegularExpression::CaseInsensitiveOption + | QRegularExpression::UseUnicodePropertiesOption ); // use a string that could not be presented in the article. articleText = articleText.replace( replaceReg, "=@XXXXX@=" ); auto hasCJK = false; - foreach(QString word,words) + foreach( QString word, words ) { - if(containCJK( word )) + if( containCJK( word ) ) { hasCJK = true; break; } } - //hascjk value ,perhaps should depend on each word - auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK ); - QRegularExpression distanceOrderReg( searchRegStr, - QRegularExpression::CaseInsensitiveOption | - QRegularExpression::UseUnicodePropertiesOption ); + // hascjk value ,perhaps should depend on each word + const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), + searchMode, + distanceBetweenWords, + hasCJK ); + const QRegularExpression distanceOrderReg( searchRegStr, + QRegularExpression::CaseInsensitiveOption + | QRegularExpression::UseUnicodePropertiesOption ); // use a string that could not be presented in the article. - if(articleText.contains(distanceOrderReg)) + if( articleText.contains( distanceOrderReg ) ) { if( headword.isEmpty() ) - offsetsForHeadwords.append( offsets.at( i ) ); + offsetsForHeadwords.append( offset ); else foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); - results++; + ++results; if( maxResults > 0 && results >= maxResults ) - break; + return; } } - } else { if( articleText.contains( searchRegularExpression ) ) { if( headword.isEmpty() ) - offsetsForHeadwords.append( offsets.at( i ) ); + offsetsForHeadwords.append( offset ); else foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) ); - - results++; + + ++results; if( maxResults > 0 && results >= maxResults ) - break; + return; } } } @@ -637,7 +627,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets, QVector< QString > headwords; dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled ); for( int x = 0; x < headwords.size(); x++ ) - foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(), matchCase ) ); + foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), + id, + x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(), + matchCase ) ); } } @@ -648,27 +641,28 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex, { // Find articles which contains all requested words - vector< BtreeIndexing::WordArticleLink > links; - QSet< uint32_t > setOfOffsets, tmp; - uint32_t size; + QSet< uint32_t > setOfOffsets; if( indexWords.isEmpty() ) return; - int n = indexWords.length(); - for( int i = 0; i < n; i++ ) + QList< QSet< uint32_t > > addressLists; + + auto findLinks = [ & ]( const QString & word ) { + QSet< uint32_t > tmp; + uint32_t size; + if( Utils::AtomicInt::loadAcquire( isCancelled ) ) - return; + addressLists<< tmp; - tmp.clear(); - - links = ftsIndex.findArticles( gd::toWString( indexWords.at( i ) ), ignoreDiacritics ); + vector< BtreeIndexing::WordArticleLink > links = + ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics ); for( unsigned x = 0; x < links.size(); x++ ) { if( Utils::AtomicInt::loadAcquire( isCancelled ) ) - return; + addressLists<< tmp; vector< char > chunk; char * linksPtr; @@ -677,24 +671,31 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex, linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk ); } - memcpy( &size, linksPtr, sizeof(uint32_t) ); - linksPtr += sizeof(uint32_t); + memcpy( &size, linksPtr, sizeof( uint32_t ) ); + linksPtr += sizeof( uint32_t ); for( uint32_t y = 0; y < size; y++ ) { tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) ); - linksPtr += sizeof(uint32_t); + linksPtr += sizeof( uint32_t ); } } links.clear(); - if( i == 0 ) - setOfOffsets = tmp; + addressLists<< tmp; + }; + // int n = indexWords.length(); + QtConcurrent::blockingMap( indexWords, findLinks ); + + int i = 0; + for( auto & elem : addressLists ) + { + if( i++ == 0 ) + setOfOffsets = elem; else - setOfOffsets = setOfOffsets.intersect( tmp ); + setOfOffsets = setOfOffsets.intersect( elem ); } - tmp.clear(); if( setOfOffsets.isEmpty() ) return; @@ -757,17 +758,15 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde if( !hieroglyphsList.empty() ) { - QSet< uint32_t > tmp; - vector< BtreeIndexing::WordArticleLink > links; - - for( int i = 0; i < hieroglyphsList.size(); i++ ) + QList< QSet< uint32_t > > sets; + auto fn_wordLink = [ & ](const QString & word ) { - links = ftsIndex.findArticles( gd::toWString( hieroglyphsList.at( i ) ) ); + QSet< uint32_t > tmp; + vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) ); for( unsigned x = 0; x < links.size(); x++ ) { - if( Utils::AtomicInt::loadAcquire( isCancelled ) ) - return; + sets<< tmp; vector< char > chunk; char * linksPtr; @@ -786,11 +785,17 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde } links.clear(); + sets<< tmp; + }; + QtConcurrent::blockingMap( hieroglyphsList, fn_wordLink ); - if( i == 0 ) - setOfOffsets = tmp; + int i = 0; + for( auto & elem : sets ) + { + if( i++ == 0 ) + setOfOffsets = elem; else - setOfOffsets = setOfOffsets.intersect( tmp ); + setOfOffsets = setOfOffsets.intersect( elem ); } allWordsLinks[ wordNom ] = setOfOffsets; diff --git a/ftshelpers.hh b/ftshelpers.hh index e5c712b9..4bd309ac 100644 --- a/ftshelpers.hh +++ b/ftshelpers.hh @@ -82,12 +82,16 @@ class FTSResultsRequest : public Dictionary::DataRequest QAtomicInt isCancelled; + QAtomicInt results; + QList< FTS::FtsHeadword > * foundHeadwords; void checkArticles( QVector< uint32_t > const & offsets, QStringList const & words, QRegExp const & searchRegexp = QRegExp() ); + void checkSingleArticle( uint32_t offset, QStringList const & words, QRegExp const & searchRegexp = QRegExp() ); + void indexSearch( BtreeIndexing::BtreeIndex & ftsIndex, sptr< ChunkedStorage::Reader > chunks, QStringList & indexWords, @@ -127,6 +131,7 @@ public: searchString = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( searchString_ ) ) ); foundHeadwords = new QList< FTS::FtsHeadword >; + results = 0; QThreadPool::globalInstance()->start( [ this ]() { this->run(); }, -100 ); } diff --git a/goldendict.pro b/goldendict.pro index 440dda9a..7ab55f53 100644 --- a/goldendict.pro +++ b/goldendict.pro @@ -47,7 +47,8 @@ QT += core \ webenginewidgets\ webchannel\ printsupport \ - help + help \ + concurrent greaterThan(QT_MAJOR_VERSION, 5): QT += webenginecore core5compat @@ -242,6 +243,7 @@ HEADERS += folding.hh \ ankiconnector.h \ article_inspect.h \ articlewebpage.h \ + base/globalregex.hh \ globalbroadcaster.h \ iframeschemehandler.h \ inc_case_folding.hh \ @@ -384,6 +386,7 @@ SOURCES += folding.cc \ ankiconnector.cpp \ article_inspect.cpp \ articlewebpage.cpp \ + base/globalregex.cc \ globalbroadcaster.cpp \ iframeschemehandler.cpp \ main.cc \ diff --git a/mdx.cc b/mdx.cc index caa3ec0f..8998bfa5 100644 --- a/mdx.cc +++ b/mdx.cc @@ -42,6 +42,7 @@ #include "tiff.hh" #include "utils.hh" +#include "base/globalregex.hh" namespace Mdx { @@ -192,51 +193,6 @@ public: }; -struct MdxRegex -{ - MdxRegex() : - allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)", - QRegularExpression::CaseInsensitiveOption ), - wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2", - QRegularExpression::CaseInsensitiveOption ), - anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", QRegularExpression::CaseInsensitiveOption ), - anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)", QRegularExpression::CaseInsensitiveOption ), - anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", QRegularExpression::CaseInsensitiveOption ), - anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", QRegularExpression::CaseInsensitiveOption ), - audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2", - QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ), - stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://" - "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2", - QRegularExpression::CaseInsensitiveOption ), - stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://" - "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)", - QRegularExpression::CaseInsensitiveOption ), - inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>", - QRegularExpression::CaseInsensitiveOption ), - closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption ), - srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://" - "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2", - QRegularExpression::CaseInsensitiveOption ), - srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://" - "|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)", - QRegularExpression::CaseInsensitiveOption ) - { - } - QRegularExpression allLinksRe; - QRegularExpression wordCrossLink; - QRegularExpression anchorIdRe; - QRegularExpression anchorIdReWord; - QRegularExpression anchorIdRe2; - QRegularExpression anchorLinkRe; - QRegularExpression audioRe; - QRegularExpression stylesRe; - QRegularExpression stylesRe2; - QRegularExpression inlineScriptRe; - QRegularExpression closeScriptTagRe; - QRegularExpression srcRe; - QRegularExpression srcRe2; -}; - class MdxDictionary: public BtreeIndexing::BtreeDictionary { Mutex idxMutex; @@ -256,8 +212,6 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary string initError; QString cacheDirName; - static MdxRegex mdxRx; - public: MdxDictionary( string const & id, string const & indexFile, vector const & dictionaryFiles ); @@ -347,8 +301,6 @@ private: friend class MddResourceRequest; }; -MdxRegex MdxDictionary::mdxRx; - MdxDictionary::MdxDictionary( string const & id, string const & indexFile, vector const & dictionaryFiles ): BtreeDictionary( id, dictionaryFiles ), @@ -972,10 +924,11 @@ void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noF decompressed.constData() + recordInfo.recordOffset, recordInfo.recordSize ); - article = MdictParser::substituteStylesheet( article, styleSheets ); - if( !noFilter ) + { + article = MdictParser::substituteStylesheet( article, styleSheets ); article = filterResource( articleId, article ); + } articleText = article.toStdString(); } @@ -987,7 +940,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar QString articleNewText; int linkPos = 0; - QRegularExpressionMatchIterator it = mdxRx.allLinksRe.globalMatch( article ); + QRegularExpressionMatchIterator it = RX::Mdx::allLinksRe.globalMatch( article ); QMap idMap; while( it.hasNext() ) { @@ -1005,10 +958,10 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' ) { - QRegularExpressionMatch match = mdxRx.anchorIdRe.match( linkTxt ); + QRegularExpressionMatch match = RX::Mdx::anchorIdRe.match( linkTxt ); if( match.hasMatch() ) { - auto wordMatch = mdxRx.anchorIdReWord.match( linkTxt ); + auto wordMatch = RX::Mdx::anchorIdReWord.match( linkTxt ); if( wordMatch.hasMatch() ) { idMap.insert( wordMatch.captured( 3 ), uniquePrefix + wordMatch.captured( 3 ) ); @@ -1017,11 +970,11 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); } else - newLink = linkTxt.replace( mdxRx.anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" ); + newLink = linkTxt.replace( RX::Mdx::anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" ); - newLink = newLink.replace( mdxRx.anchorLinkRe, "\\1#" + uniquePrefix ); + newLink = newLink.replace( RX::Mdx::anchorLinkRe, "\\1#" + uniquePrefix ); - match = mdxRx.audioRe.match( newLink ); + match = RX::Mdx::audioRe.match( newLink ); if( match.hasMatch() ) { // sounds and audio link script @@ -1032,7 +985,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar + newLink.replace( match.capturedStart(), match.capturedLength(), newTxt ); } - match = mdxRx.wordCrossLink.match( newLink ); + match = RX::Mdx::wordCrossLink.match( newLink ); if( match.hasMatch() ) { QString newTxt = match.captured( 1 ) + match.captured( 2 ) @@ -1050,7 +1003,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar if( linkType.compare( "link" ) == 0 ) { // stylesheets - QRegularExpressionMatch match = mdxRx.stylesRe.match( linkTxt ); + QRegularExpressionMatch match = RX::Mdx::stylesRe.match( linkTxt ); if( match.hasMatch() ) { QString newText = match.captured( 1 ) + match.captured( 2 ) @@ -1059,7 +1012,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); } else - newLink = linkTxt.replace( mdxRx.stylesRe2, + newLink = linkTxt.replace( RX::Mdx::stylesRe2, "\\1\"bres://" + id + "/\\2\"" ); } else @@ -1067,13 +1020,13 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar || linkType.compare( "source" ) == 0 ) { // javascripts and images - QRegularExpressionMatch match = mdxRx.inlineScriptRe.match( linkTxt ); + QRegularExpressionMatch match = RX::Mdx::inlineScriptRe.match( linkTxt ); if( linkType.at( 1 ) == 'c' // "script" tag && match.hasMatch() && match.capturedLength() == linkTxt.length() ) { // skip inline scripts articleNewText += linkTxt; - match = mdxRx.closeScriptTagRe.match( article, linkPos ); + match = RX::Mdx::closeScriptTagRe.match( article, linkPos ); if( match.hasMatch() ) { articleNewText += article.mid( linkPos, match.capturedEnd() - linkPos ); @@ -1083,7 +1036,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar } else { - match = mdxRx.srcRe.match( linkTxt ); + match = RX::Mdx::srcRe.match( linkTxt ); if( match.hasMatch() ) { QString newText; @@ -1104,7 +1057,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText ); } else - newLink = linkTxt.replace( mdxRx.srcRe2, + newLink = linkTxt.replace( RX::Mdx::srcRe2, "\\1\"bres://" + id + "/\\2\"" ); } }