Merge branch 'staged' into dev

This commit is contained in:
Xiao YiFang 2022-06-13 19:48:06 +08:00
commit e2d470d6dc
11 changed files with 246 additions and 184 deletions

View file

@ -1,5 +1,7 @@
name: macos-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on:
workflow_dispatch:

View file

@ -1,5 +1,7 @@
name: Ubuntu-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on:
workflow_dispatch:

View file

@ -1,5 +1,7 @@
name: Windows-PR-check
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
on:
workflow_dispatch:

View file

@ -24,24 +24,19 @@ a:hover
background: white;
}
/* Dictionary's name heading */
.gddictname
{
border: 1px dotted black; padding: 0.2em; padding-left: 0.5em;
margin-top: 1.2em; margin-bottom: 0.1em; font-weight: bold; font-size: 14px;
background: #87CEEB;
}
/* The 'From ' string which precedes the dictionary name in the heading */
.gdfromprefix
{
display: none;
}
/* Dictionary's name heading */
.gddictname
{
padding: 0.2em; padding-left: 0.5em;
margin-bottom: 0.1em;
font-size: 14px;
font-weight: normal;
float: right;
border: 1px solid white;
margin-top: 7px;

View file

@ -42,6 +42,11 @@ pre
/*background: #ffffdd;*/
}
.gddicttitle
{
user-select: none;
}
.gddictnamebodyseparator
{
clear: both;

50
base/globalregex.cc Normal file
View file

@ -0,0 +1,50 @@
#include "globalregex.hh"
#include "fulltextsearch.hh"
using namespace RX;
QRegularExpression Ftx::regBrackets(
"(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression Ftx::regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression Ftx::spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression Ftx::wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}",
QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression Ftx::setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
QRegularExpression Ftx::regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})",
QRegularExpression::CaseInsensitiveOption );
//mdx
QRegularExpression Mdx::allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::InvertedGreedinessOption );
QRegularExpression Mdx::stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption );
QRegularExpression Mdx::srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption );

40
base/globalregex.hh Normal file
View file

@ -0,0 +1,40 @@
#ifndef GLOBALREGEX_HH
#define GLOBALREGEX_HH
#include <QRegularExpression>
namespace RX
{
/// Shared regular expressions for the full-text search code.
/// Every member is a static object defined once in globalregex.cc, so callers
/// reuse it rather than constructing a new QRegularExpression per call.
struct Ftx
{
  /// Word with optional parenthesised parts before, inside and after it.
  static QRegularExpression regBrackets;

  /// Run of characters that are neither word characters nor combining marks.
  static QRegularExpression regSplit;

  /// Run of non-word characters.
  static QRegularExpression spacesRegExp;

  /// Run of at least FTS::MinimumWordSize word characters.
  static QRegularExpression wordRegExp;

  /// Bracketed character set, e.g. "[abc]".
  static QRegularExpression setsRegExp;

  /// Backslash escape sequences (single-letter classes, hex and octal codes).
  static QRegularExpression regexRegExp;
};
/// Shared regular expressions for rewriting links and resources in MDX
/// article HTML. Every member is a static object defined once in
/// globalregex.cc; all of them are case-insensitive.
struct Mdx
{
  /// Opening a / area / img / link / script / source tag.
  static QRegularExpression allLinksRe;

  /// Quoted href pointing at "entry://...", with optional "#..." part.
  static QRegularExpression wordCrossLink;

  /// Quoted name= / id= attribute, matched up to the start of its value.
  static QRegularExpression anchorIdRe;

  /// Like anchorIdRe, additionally capturing the value up to the next double quote.
  static QRegularExpression anchorIdReWord;

  /// Unquoted name= / id= attribute value.
  static QRegularExpression anchorIdRe2;

  /// Quoted href starting with "entry://#".
  static QRegularExpression anchorLinkRe;

  /// Quoted href pointing at "sound://..." (inverted greediness).
  static QRegularExpression audioRe;

  /// Quoted href that is not a bres/http(s)/ftp/data/javascript URL.
  static QRegularExpression stylesRe;

  /// Unquoted counterpart of stylesRe.
  static QRegularExpression stylesRe2;

  /// Opening script tag without a src attribute.
  static QRegularExpression inlineScriptRe;

  /// Closing script tag.
  static QRegularExpression closeScriptTagRe;

  /// Quoted src that is not a bres/http(s)/ftp/data/javascript URL.
  static QRegularExpression srcRe;

  /// Unquoted counterpart of srcRe.
  static QRegularExpression srcRe2;
};
} // namespace RX
#endif // GLOBALREGEX_HH

View file

@ -17,6 +17,8 @@
#include <QRegularExpression>
#include "wildcard.hh"
#include <QtConcurrent>
#include "base/globalregex.hh"
using std::vector;
using std::string;
@ -147,36 +149,36 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
{
searchWords.clear();
indexWords.clear();
QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
// QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
// QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
// QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
// QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
hasCJK = containCJK( str );
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
{
// Make words list for search in article text
searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts );
searchWords = str.normalized( QString::NormalizationForm_C ).split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
// Make words list for index search
QStringList list =
str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts );
str.normalized( QString::NormalizationForm_C ).toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
QString searchString;
if( hasCJK )
{
tokenizeCJK( indexWords, wordRegExp, list );
tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
}
else
{
indexWords = list.filter( wordRegExp );
indexWords = list.filter( RX::Ftx::wordRegExp );
indexWords.removeDuplicates();
// Make regexp for results hilite
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
QStringList allWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
}
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
@ -191,21 +193,21 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
// Remove RegExp commands
if( searchMode == FTS::RegExp )
tmp.replace( regexRegExp, " " );
tmp.replace( RX::Ftx::regexRegExp, " " );
// Remove all symbol sets
tmp.replace( setsRegExp, " " );
tmp.replace( RX::Ftx::setsRegExp, " " );
QStringList list = tmp.normalized( QString::NormalizationForm_C )
.toLower().split( spacesRegExp, Qt::SkipEmptyParts );
.toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
if( hasCJK )
{
tokenizeCJK( indexWords, wordRegExp, list );
tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
}
else
{
indexWords = list.filter( wordRegExp );
indexWords = list.filter( RX::Ftx::wordRegExp );
indexWords.removeDuplicates();
}
@ -224,9 +226,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
if( articleText.isEmpty() )
return;
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
QRegularExpression::UseUnicodePropertiesOption);
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
// QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
// QRegularExpression::UseUnicodePropertiesOption);
// QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
QStringList articleWords = articleText.normalized( QString::NormalizationForm_C )
.split( QRegularExpression( handleRoundBrackets ? "[^\\w\\(\\)\\p{M}]+" : "[^\\w\\p{M}]+",
@ -275,12 +277,12 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
// Special handle for words with round brackets - DSL feature
QStringList list;
QStringList oldVariant = word.split( regSplit, Qt::SkipEmptyParts );
QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it )
if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) )
list.append( *it );
QRegularExpressionMatch match = regBrackets.match( word );
QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
if( match.hasMatch() )
{
QStringList parts = match.capturedTexts();
@ -445,21 +447,20 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
QStringList const & words,
QRegExp const & searchRegexp )
{
int results = 0;
QtConcurrent::blockingMap( offsets, [ & ]( uint32_t offset ) { checkSingleArticle( offset, words, searchRegexp ); } );
}
void FTSResultsRequest::checkSingleArticle( uint32_t offset,
QStringList const & words,
QRegExp const & searchRegexp )
{
qDebug()<<"checking"<<offset<<QThread::currentThreadId();
// int results = 0;
QString headword, articleText;
QList< uint32_t > offsetsForHeadwords;
QVector< QStringList > hiliteRegExps;
QString id = QString::fromUtf8( dict.getId().c_str() );
bool needHandleBrackets;
{
QString name = QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ).toLower();
needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( ".dsl.dz" );
}
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
QRegularExpression::UseUnicodePropertiesOption);
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
// RegExp mode
QRegularExpression searchRegularExpression;
@ -478,12 +479,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
{
for( int i = 0; i < offsets.size(); i++ )
// for( int i = 0; i < offsets.size(); i++ )
{
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break;
return;
dict.getArticleText( offsets.at( i ), headword, articleText );
// auto article_address = offsets.at( i );
dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics )
@ -492,13 +494,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
if( articleText.contains( searchRegularExpression ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) );
offsetsForHeadwords.append( offset );
else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++;
++results;
if( maxResults > 0 && results >= maxResults )
break;
return;
}
}
}
@ -506,10 +508,6 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
{
// Words mode
QRegularExpression splitWithBrackets( "[^\\w\\(\\)\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression splitWithoutBrackets( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
Qt::CaseSensitivity cs = matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive;
QVector< QPair< QString, bool > > wordsList;
if( ignoreWordsOrder )
{
@ -517,18 +515,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
wordsList.append( QPair< QString, bool >( *it, true ) );
}
for( int i = 0; i < offsets.size(); i++ )
// for( int i = 0; i < offsets.size(); i++ )
{
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
break;
int pos = 0;
int matchWordNom = 0;
int unmatchWordNom = 0;
int nextNotFoundPos = 0;
QVector< QStringList > allOrders;
QStringList order;
return;
if( ignoreWordsOrder )
{
@ -536,16 +526,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
wordsList[ i ].second = true;
}
dict.getArticleText( offsets.at( i ), headword, articleText );
dict.getArticleText( offset, headword, articleText );
articleText = articleText.normalized( QString::NormalizationForm_C );
if( ignoreDiacritics )
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
//QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
// Qt::SkipEmptyParts );
if( ignoreWordsOrder )
{
bool allMatch = true;
@ -561,28 +548,29 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
}
else if( searchMode == FTS::WholeWords )
{
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption );
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
if( !articleText.contains( tmpReg ) )
{
allMatch = false;
break;
}
}
}
if( !allMatch )
{
continue;
return;
}
if( distanceBetweenWords >= 0 )
{
// the article text contains all the needed words.
// determine if the distance restriction is met
QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption );
const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article.
articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
@ -597,37 +585,39 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
}
// hascjk value ,perhaps should depend on each word
auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK );
QRegularExpression distanceOrderReg( searchRegStr,
QRegularExpression::CaseInsensitiveOption |
QRegularExpression::UseUnicodePropertiesOption );
const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ),
searchMode,
distanceBetweenWords,
hasCJK );
const QRegularExpression distanceOrderReg( searchRegStr,
QRegularExpression::CaseInsensitiveOption
| QRegularExpression::UseUnicodePropertiesOption );
// use a string that could not be presented in the article.
if( articleText.contains( distanceOrderReg ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) );
offsetsForHeadwords.append( offset );
else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++;
++results;
if( maxResults > 0 && results >= maxResults )
break;
return;
}
}
}
else
{
if( articleText.contains( searchRegularExpression ) )
{
if( headword.isEmpty() )
offsetsForHeadwords.append( offsets.at( i ) );
offsetsForHeadwords.append( offset );
else
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
results++;
++results;
if( maxResults > 0 && results >= maxResults )
break;
return;
}
}
}
@ -637,7 +627,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
QVector< QString > headwords;
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
for( int x = 0; x < headwords.size(); x++ )
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(), matchCase ) );
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ),
id,
x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(),
matchCase ) );
}
}
@ -648,27 +641,28 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
{
// Find articles which contains all requested words
vector< BtreeIndexing::WordArticleLink > links;
QSet< uint32_t > setOfOffsets, tmp;
uint32_t size;
QSet< uint32_t > setOfOffsets;
if( indexWords.isEmpty() )
return;
int n = indexWords.length();
for( int i = 0; i < n; i++ )
QList< QSet< uint32_t > > addressLists;
auto findLinks = [ & ]( const QString & word )
{
QSet< uint32_t > tmp;
uint32_t size;
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return;
addressLists<< tmp;
tmp.clear();
links = ftsIndex.findArticles( gd::toWString( indexWords.at( i ) ), ignoreDiacritics );
vector< BtreeIndexing::WordArticleLink > links =
ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics );
for( unsigned x = 0; x < links.size(); x++ )
{
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return;
addressLists<< tmp;
vector< char > chunk;
char * linksPtr;
@ -688,13 +682,20 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
links.clear();
if( i == 0 )
setOfOffsets = tmp;
addressLists<< tmp;
};
// int n = indexWords.length();
QtConcurrent::blockingMap( indexWords, findLinks );
int i = 0;
for( auto & elem : addressLists )
{
if( i++ == 0 )
setOfOffsets = elem;
else
setOfOffsets = setOfOffsets.intersect( tmp );
setOfOffsets = setOfOffsets.intersect( elem );
}
tmp.clear();
if( setOfOffsets.isEmpty() )
return;
@ -757,17 +758,15 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
if( !hieroglyphsList.empty() )
{
QSet< uint32_t > tmp;
vector< BtreeIndexing::WordArticleLink > links;
for( int i = 0; i < hieroglyphsList.size(); i++ )
QList< QSet< uint32_t > > sets;
auto fn_wordLink = [ & ](const QString & word )
{
links = ftsIndex.findArticles( gd::toWString( hieroglyphsList.at( i ) ) );
QSet< uint32_t > tmp;
vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) );
for( unsigned x = 0; x < links.size(); x++ )
{
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
return;
sets<< tmp;
vector< char > chunk;
char * linksPtr;
@ -786,11 +785,17 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
}
links.clear();
sets<< tmp;
};
QtConcurrent::blockingMap( hieroglyphsList, fn_wordLink );
if( i == 0 )
setOfOffsets = tmp;
int i = 0;
for( auto & elem : sets )
{
if( i++ == 0 )
setOfOffsets = elem;
else
setOfOffsets = setOfOffsets.intersect( tmp );
setOfOffsets = setOfOffsets.intersect( elem );
}
allWordsLinks[ wordNom ] = setOfOffsets;

View file

@ -82,12 +82,16 @@ class FTSResultsRequest : public Dictionary::DataRequest
QAtomicInt isCancelled;
QAtomicInt results;
QList< FTS::FtsHeadword > * foundHeadwords;
void checkArticles( QVector< uint32_t > const & offsets,
QStringList const & words,
QRegExp const & searchRegexp = QRegExp() );
void checkSingleArticle( uint32_t offset, QStringList const & words, QRegExp const & searchRegexp = QRegExp() );
void indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
sptr< ChunkedStorage::Reader > chunks,
QStringList & indexWords,
@ -127,6 +131,7 @@ public:
searchString = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( searchString_ ) ) );
foundHeadwords = new QList< FTS::FtsHeadword >;
results = 0;
QThreadPool::globalInstance()->start( [ this ]() { this->run(); }, -100 );
}

View file

@ -47,7 +47,8 @@ QT += core \
webenginewidgets\
webchannel\
printsupport \
help
help \
concurrent
greaterThan(QT_MAJOR_VERSION, 5): QT += webenginecore core5compat
@ -242,6 +243,7 @@ HEADERS += folding.hh \
ankiconnector.h \
article_inspect.h \
articlewebpage.h \
base/globalregex.hh \
globalbroadcaster.h \
iframeschemehandler.h \
inc_case_folding.hh \
@ -384,6 +386,7 @@ SOURCES += folding.cc \
ankiconnector.cpp \
article_inspect.cpp \
articlewebpage.cpp \
base/globalregex.cc \
globalbroadcaster.cpp \
iframeschemehandler.cpp \
main.cc \

81
mdx.cc
View file

@ -42,6 +42,7 @@
#include "tiff.hh"
#include "utils.hh"
#include "base/globalregex.hh"
namespace Mdx
{
@ -192,51 +193,6 @@ public:
};
struct MdxRegex
{
MdxRegex() :
allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
QRegularExpression::CaseInsensitiveOption ),
wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
QRegularExpression::CaseInsensitiveOption ),
anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", QRegularExpression::CaseInsensitiveOption ),
anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)", QRegularExpression::CaseInsensitiveOption ),
anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", QRegularExpression::CaseInsensitiveOption ),
anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", QRegularExpression::CaseInsensitiveOption ),
audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ),
stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption ),
stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption ),
inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
QRegularExpression::CaseInsensitiveOption ),
closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption ),
srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
QRegularExpression::CaseInsensitiveOption ),
srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
QRegularExpression::CaseInsensitiveOption )
{
}
QRegularExpression allLinksRe;
QRegularExpression wordCrossLink;
QRegularExpression anchorIdRe;
QRegularExpression anchorIdReWord;
QRegularExpression anchorIdRe2;
QRegularExpression anchorLinkRe;
QRegularExpression audioRe;
QRegularExpression stylesRe;
QRegularExpression stylesRe2;
QRegularExpression inlineScriptRe;
QRegularExpression closeScriptTagRe;
QRegularExpression srcRe;
QRegularExpression srcRe2;
};
class MdxDictionary: public BtreeIndexing::BtreeDictionary
{
Mutex idxMutex;
@ -256,8 +212,6 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
string initError;
QString cacheDirName;
static MdxRegex mdxRx;
public:
MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );
@ -347,8 +301,6 @@ private:
friend class MddResourceRequest;
};
MdxRegex MdxDictionary::mdxRx;
MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
vector<string> const & dictionaryFiles ):
BtreeDictionary( id, dictionaryFiles ),
@ -972,10 +924,11 @@ void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noF
decompressed.constData() + recordInfo.recordOffset,
recordInfo.recordSize );
article = MdictParser::substituteStylesheet( article, styleSheets );
if( !noFilter )
{
article = MdictParser::substituteStylesheet( article, styleSheets );
article = filterResource( articleId, article );
}
articleText = article.toStdString();
}
@ -987,7 +940,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
QString articleNewText;
int linkPos = 0;
QRegularExpressionMatchIterator it = mdxRx.allLinksRe.globalMatch( article );
QRegularExpressionMatchIterator it = RX::Mdx::allLinksRe.globalMatch( article );
QMap<QString,QString> idMap;
while( it.hasNext() )
{
@ -1005,10 +958,10 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
{
QRegularExpressionMatch match = mdxRx.anchorIdRe.match( linkTxt );
QRegularExpressionMatch match = RX::Mdx::anchorIdRe.match( linkTxt );
if( match.hasMatch() )
{
auto wordMatch = mdxRx.anchorIdReWord.match( linkTxt );
auto wordMatch = RX::Mdx::anchorIdReWord.match( linkTxt );
if( wordMatch.hasMatch() )
{
idMap.insert( wordMatch.captured( 3 ), uniquePrefix + wordMatch.captured( 3 ) );
@ -1017,11 +970,11 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
}
else
newLink = linkTxt.replace( mdxRx.anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
newLink = linkTxt.replace( RX::Mdx::anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
newLink = newLink.replace( mdxRx.anchorLinkRe, "\\1#" + uniquePrefix );
newLink = newLink.replace( RX::Mdx::anchorLinkRe, "\\1#" + uniquePrefix );
match = mdxRx.audioRe.match( newLink );
match = RX::Mdx::audioRe.match( newLink );
if( match.hasMatch() )
{
// sounds and audio link script
@ -1032,7 +985,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
+ newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
}
match = mdxRx.wordCrossLink.match( newLink );
match = RX::Mdx::wordCrossLink.match( newLink );
if( match.hasMatch() )
{
QString newTxt = match.captured( 1 ) + match.captured( 2 )
@ -1050,7 +1003,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
if( linkType.compare( "link" ) == 0 )
{
// stylesheets
QRegularExpressionMatch match = mdxRx.stylesRe.match( linkTxt );
QRegularExpressionMatch match = RX::Mdx::stylesRe.match( linkTxt );
if( match.hasMatch() )
{
QString newText = match.captured( 1 ) + match.captured( 2 )
@ -1059,7 +1012,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
}
else
newLink = linkTxt.replace( mdxRx.stylesRe2,
newLink = linkTxt.replace( RX::Mdx::stylesRe2,
"\\1\"bres://" + id + "/\\2\"" );
}
else
@ -1067,13 +1020,13 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|| linkType.compare( "source" ) == 0 )
{
// javascripts and images
QRegularExpressionMatch match = mdxRx.inlineScriptRe.match( linkTxt );
QRegularExpressionMatch match = RX::Mdx::inlineScriptRe.match( linkTxt );
if( linkType.at( 1 ) == 'c' // "script" tag
&& match.hasMatch() && match.capturedLength() == linkTxt.length() )
{
// skip inline scripts
articleNewText += linkTxt;
match = mdxRx.closeScriptTagRe.match( article, linkPos );
match = RX::Mdx::closeScriptTagRe.match( article, linkPos );
if( match.hasMatch() )
{
articleNewText += article.mid( linkPos, match.capturedEnd() - linkPos );
@ -1083,7 +1036,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
}
else
{
match = mdxRx.srcRe.match( linkTxt );
match = RX::Mdx::srcRe.match( linkTxt );
if( match.hasMatch() )
{
QString newText;
@ -1104,7 +1057,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
}
else
newLink = linkTxt.replace( mdxRx.srcRe2,
newLink = linkTxt.replace( RX::Mdx::srcRe2,
"\\1\"bres://" + id + "/\\2\"" );
}
}