mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 12:44:07 +00:00
Merge branch 'staged' into dev
This commit is contained in:
commit
e2d470d6dc
4
.github/workflows/macos-PR-check.yml
vendored
4
.github/workflows/macos-PR-check.yml
vendored
|
@ -1,5 +1,7 @@
|
|||
name: macos-PR-check
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
on:
|
||||
|
||||
workflow_dispatch:
|
||||
|
|
4
.github/workflows/ubuntu-PR-check.yml
vendored
4
.github/workflows/ubuntu-PR-check.yml
vendored
|
@ -1,5 +1,7 @@
|
|||
name: Ubuntu-PR-check
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
on:
|
||||
|
||||
workflow_dispatch:
|
||||
|
|
4
.github/workflows/windows-PR-check.yml
vendored
4
.github/workflows/windows-PR-check.yml
vendored
|
@ -1,5 +1,7 @@
|
|||
name: Windows-PR-check
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
on:
|
||||
|
||||
workflow_dispatch:
|
||||
|
|
|
@ -24,24 +24,19 @@ a:hover
|
|||
background: white;
|
||||
}
|
||||
|
||||
/* Dictionary's name heading */
|
||||
.gddictname
|
||||
{
|
||||
border: 1px dotted black; padding: 0.2em; padding-left: 0.5em;
|
||||
margin-top: 1.2em; margin-bottom: 0.1em; font-weight: bold; font-size: 14px;
|
||||
background: #87CEEB;
|
||||
}
|
||||
|
||||
/* The 'From ' string which preceeds dictionary name in the heading */
|
||||
.gdfromprefix
|
||||
{
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Dictionary's name heading */
|
||||
.gddictname
|
||||
{
|
||||
padding: 0.2em; padding-left: 0.5em;
|
||||
margin-bottom: 0.1em;
|
||||
font-size: 14px;
|
||||
font-weight: normal;
|
||||
|
||||
float: right;
|
||||
border: 1px solid white;
|
||||
margin-top: 7px;
|
||||
|
|
|
@ -42,6 +42,11 @@ pre
|
|||
/*background: #ffffdd;*/
|
||||
}
|
||||
|
||||
.gddicttitle
|
||||
{
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
.gddictnamebodyseparator
|
||||
{
|
||||
clear: both;
|
||||
|
|
50
base/globalregex.cc
Normal file
50
base/globalregex.cc
Normal file
|
@ -0,0 +1,50 @@
|
|||
#include "globalregex.hh"
|
||||
#include "fulltextsearch.hh"
|
||||
|
||||
using namespace RX;
|
||||
|
||||
QRegularExpression Ftx::regBrackets(
|
||||
"(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression Ftx::regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
|
||||
QRegularExpression Ftx::spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression Ftx::wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}",
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression Ftx::setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Ftx::regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
|
||||
|
||||
//mdx
|
||||
|
||||
QRegularExpression Mdx::allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption
|
||||
| QRegularExpression::InvertedGreedinessOption );
|
||||
QRegularExpression Mdx::stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression Mdx::srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
|
||||
QRegularExpression::CaseInsensitiveOption );
|
40
base/globalregex.hh
Normal file
40
base/globalregex.hh
Normal file
|
@ -0,0 +1,40 @@
|
|||
#ifndef GLOBALREGEX_HH
|
||||
#define GLOBALREGEX_HH
|
||||
|
||||
#include <QRegularExpression>
|
||||
|
||||
namespace RX
|
||||
{
|
||||
class Ftx
|
||||
{
|
||||
public:
|
||||
static QRegularExpression regBrackets;
|
||||
static QRegularExpression regSplit;
|
||||
static QRegularExpression spacesRegExp;
|
||||
static QRegularExpression wordRegExp;
|
||||
static QRegularExpression setsRegExp;
|
||||
static QRegularExpression regexRegExp;
|
||||
};
|
||||
|
||||
|
||||
class Mdx
|
||||
{
|
||||
public:
|
||||
static QRegularExpression allLinksRe;
|
||||
static QRegularExpression wordCrossLink;
|
||||
static QRegularExpression anchorIdRe;
|
||||
static QRegularExpression anchorIdReWord;
|
||||
static QRegularExpression anchorIdRe2;
|
||||
static QRegularExpression anchorLinkRe;
|
||||
static QRegularExpression audioRe;
|
||||
static QRegularExpression stylesRe;
|
||||
static QRegularExpression stylesRe2;
|
||||
static QRegularExpression inlineScriptRe;
|
||||
static QRegularExpression closeScriptTagRe;
|
||||
static QRegularExpression srcRe;
|
||||
static QRegularExpression srcRe2;
|
||||
};
|
||||
|
||||
} // namespace RX
|
||||
|
||||
#endif // GLOBALREGEX_HH
|
219
ftshelpers.cc
219
ftshelpers.cc
|
@ -17,6 +17,8 @@
|
|||
#include <QRegularExpression>
|
||||
|
||||
#include "wildcard.hh"
|
||||
#include <QtConcurrent>
|
||||
#include "base/globalregex.hh"
|
||||
|
||||
using std::vector;
|
||||
using std::string;
|
||||
|
@ -147,36 +149,36 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
|||
{
|
||||
searchWords.clear();
|
||||
indexWords.clear();
|
||||
QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
|
||||
QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
|
||||
// QRegularExpression spacesRegExp( "\\W+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
// QRegularExpression wordRegExp( QString( "\\w{" ) + QString::number( FTS::MinimumWordSize ) + ",}", QRegularExpression::UseUnicodePropertiesOption );
|
||||
// QRegularExpression setsRegExp( "\\[[^\\]]+\\]", QRegularExpression::CaseInsensitiveOption );
|
||||
// QRegularExpression regexRegExp( "\\\\[afnrtvdDwWsSbB]|\\\\x([0-9A-Fa-f]{4})|\\\\0([0-7]{3})", QRegularExpression::CaseInsensitiveOption);
|
||||
|
||||
hasCJK = containCJK( str );
|
||||
|
||||
if( searchMode == FTS::WholeWords || searchMode == FTS::PlainText )
|
||||
{
|
||||
// Make words list for search in article text
|
||||
searchWords = str.normalized( QString::NormalizationForm_C ).split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
searchWords = str.normalized( QString::NormalizationForm_C ).split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
|
||||
// Make words list for index search
|
||||
QStringList list =
|
||||
str.normalized( QString::NormalizationForm_C ).toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
str.normalized( QString::NormalizationForm_C ).toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
|
||||
|
||||
QString searchString;
|
||||
if( hasCJK )
|
||||
{
|
||||
tokenizeCJK( indexWords, wordRegExp, list );
|
||||
tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
|
||||
// QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
searchString = makeHiliteRegExpString( list, searchMode, distanceBetweenWords, hasCJK , ignoreWordsOrder);
|
||||
}
|
||||
else
|
||||
{
|
||||
indexWords = list.filter( wordRegExp );
|
||||
indexWords = list.filter( RX::Ftx::wordRegExp );
|
||||
indexWords.removeDuplicates();
|
||||
|
||||
// Make regexp for results hilite
|
||||
|
||||
QStringList allWords = str.split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
QStringList allWords = str.split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
|
||||
searchString = makeHiliteRegExpString( allWords, searchMode, distanceBetweenWords,false, ignoreWordsOrder );
|
||||
}
|
||||
searchRegExp = QRegExp( searchString, matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive, QRegExp::RegExp2 );
|
||||
|
@ -191,21 +193,21 @@ bool parseSearchString( QString const & str, QStringList & indexWords,
|
|||
|
||||
// Remove RegExp commands
|
||||
if( searchMode == FTS::RegExp )
|
||||
tmp.replace( regexRegExp, " " );
|
||||
tmp.replace( RX::Ftx::regexRegExp, " " );
|
||||
|
||||
// Remove all symbol sets
|
||||
tmp.replace( setsRegExp, " " );
|
||||
tmp.replace( RX::Ftx::setsRegExp, " " );
|
||||
|
||||
QStringList list = tmp.normalized( QString::NormalizationForm_C )
|
||||
.toLower().split( spacesRegExp, Qt::SkipEmptyParts );
|
||||
.toLower().split( RX::Ftx::spacesRegExp, Qt::SkipEmptyParts );
|
||||
|
||||
if( hasCJK )
|
||||
{
|
||||
tokenizeCJK( indexWords, wordRegExp, list );
|
||||
tokenizeCJK( indexWords, RX::Ftx::wordRegExp, list );
|
||||
}
|
||||
else
|
||||
{
|
||||
indexWords = list.filter( wordRegExp );
|
||||
indexWords = list.filter( RX::Ftx::wordRegExp );
|
||||
indexWords.removeDuplicates();
|
||||
}
|
||||
|
||||
|
@ -224,9 +226,9 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
|||
if( articleText.isEmpty() )
|
||||
return;
|
||||
|
||||
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
|
||||
QRegularExpression::UseUnicodePropertiesOption);
|
||||
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
// QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
|
||||
// QRegularExpression::UseUnicodePropertiesOption);
|
||||
// QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
|
||||
QStringList articleWords = articleText.normalized( QString::NormalizationForm_C )
|
||||
.split( QRegularExpression( handleRoundBrackets ? "[^\\w\\(\\)\\p{M}]+" : "[^\\w\\p{M}]+",
|
||||
|
@ -275,12 +277,12 @@ void parseArticleForFts( uint32_t articleAddress, QString & articleText,
|
|||
// Special handle for words with round brackets - DSL feature
|
||||
QStringList list;
|
||||
|
||||
QStringList oldVariant = word.split( regSplit, Qt::SkipEmptyParts );
|
||||
QStringList oldVariant = word.split( RX::Ftx::regSplit, Qt::SkipEmptyParts );
|
||||
for( QStringList::iterator it = oldVariant.begin(); it != oldVariant.end(); ++it )
|
||||
if( it->size() >= FTS::MinimumWordSize && !list.contains( *it ) )
|
||||
list.append( *it );
|
||||
|
||||
QRegularExpressionMatch match = regBrackets.match( word );
|
||||
QRegularExpressionMatch match = RX::Ftx::regBrackets.match( word );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
QStringList parts = match.capturedTexts();
|
||||
|
@ -445,21 +447,20 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
QStringList const & words,
|
||||
QRegExp const & searchRegexp )
|
||||
{
|
||||
int results = 0;
|
||||
QtConcurrent::blockingMap( offsets, [ & ]( uint32_t offset ) { checkSingleArticle( offset, words, searchRegexp ); } );
|
||||
}
|
||||
|
||||
void FTSResultsRequest::checkSingleArticle( uint32_t offset,
|
||||
QStringList const & words,
|
||||
QRegExp const & searchRegexp )
|
||||
{
|
||||
qDebug()<<"checking"<<offset<<QThread::currentThreadId();
|
||||
// int results = 0;
|
||||
QString headword, articleText;
|
||||
QList< uint32_t > offsetsForHeadwords;
|
||||
QVector< QStringList > hiliteRegExps;
|
||||
|
||||
QString id = QString::fromUtf8( dict.getId().c_str() );
|
||||
bool needHandleBrackets;
|
||||
{
|
||||
QString name = QString::fromUtf8( dict.getDictionaryFilenames()[ 0 ].c_str() ).toLower();
|
||||
needHandleBrackets = name.endsWith( ".dsl" ) || name.endsWith( ".dsl.dz" );
|
||||
}
|
||||
|
||||
QRegularExpression regBrackets( "(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+)(\\([\\w\\p{M}]+\\)){0,1}([\\w\\p{M}]+){0,1}(\\([\\w\\p{M}]+\\)){0,1}",
|
||||
QRegularExpression::UseUnicodePropertiesOption);
|
||||
QRegularExpression regSplit( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
|
||||
// RegExp mode
|
||||
QRegularExpression searchRegularExpression;
|
||||
|
@ -478,12 +479,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
|
||||
if( searchMode == FTS::Wildcards || searchMode == FTS::RegExp )
|
||||
{
|
||||
for( int i = 0; i < offsets.size(); i++ )
|
||||
// for( int i = 0; i < offsets.size(); i++ )
|
||||
{
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
break;
|
||||
return;
|
||||
|
||||
dict.getArticleText( offsets.at( i ), headword, articleText );
|
||||
// auto article_address = offsets.at( i );
|
||||
dict.getArticleText( offset, headword, articleText );
|
||||
articleText = articleText.normalized( QString::NormalizationForm_C );
|
||||
|
||||
if( ignoreDiacritics )
|
||||
|
@ -492,13 +494,13 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
if( articleText.contains( searchRegularExpression ) )
|
||||
{
|
||||
if( headword.isEmpty() )
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
offsetsForHeadwords.append( offset );
|
||||
else
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
|
||||
|
||||
results++;
|
||||
++results;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -506,10 +508,6 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
{
|
||||
// Words mode
|
||||
|
||||
QRegularExpression splitWithBrackets( "[^\\w\\(\\)\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
QRegularExpression splitWithoutBrackets( "[^\\w\\p{M}]+", QRegularExpression::UseUnicodePropertiesOption );
|
||||
|
||||
Qt::CaseSensitivity cs = matchCase ? Qt::CaseSensitive : Qt::CaseInsensitive;
|
||||
QVector< QPair< QString, bool > > wordsList;
|
||||
if( ignoreWordsOrder )
|
||||
{
|
||||
|
@ -517,18 +515,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
wordsList.append( QPair< QString, bool >( *it, true ) );
|
||||
}
|
||||
|
||||
for( int i = 0; i < offsets.size(); i++ )
|
||||
// for( int i = 0; i < offsets.size(); i++ )
|
||||
{
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
break;
|
||||
|
||||
int pos = 0;
|
||||
int matchWordNom = 0;
|
||||
int unmatchWordNom = 0;
|
||||
int nextNotFoundPos = 0;
|
||||
|
||||
QVector< QStringList > allOrders;
|
||||
QStringList order;
|
||||
return;
|
||||
|
||||
if( ignoreWordsOrder )
|
||||
{
|
||||
|
@ -536,17 +526,14 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
wordsList[ i ].second = true;
|
||||
}
|
||||
|
||||
dict.getArticleText( offsets.at( i ), headword, articleText );
|
||||
dict.getArticleText( offset, headword, articleText );
|
||||
|
||||
articleText = articleText.normalized( QString::NormalizationForm_C );
|
||||
|
||||
if( ignoreDiacritics )
|
||||
articleText = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( articleText ) ) );
|
||||
|
||||
//QStringList articleWords = articleText.split( needHandleBrackets ? splitWithBrackets : splitWithoutBrackets,
|
||||
// Qt::SkipEmptyParts );
|
||||
|
||||
if(ignoreWordsOrder)
|
||||
if( ignoreWordsOrder )
|
||||
{
|
||||
bool allMatch = true;
|
||||
foreach( QString word, words )
|
||||
|
@ -559,75 +546,78 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
break;
|
||||
}
|
||||
}
|
||||
else if( searchMode == FTS::WholeWords)
|
||||
else if( searchMode == FTS::WholeWords )
|
||||
{
|
||||
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),QRegularExpression::CaseInsensitiveOption|QRegularExpression::UseUnicodePropertiesOption );
|
||||
if( !articleText.contains( tmpReg) )
|
||||
QRegularExpression tmpReg( QString( "\b%1\b" ).arg( word ),
|
||||
QRegularExpression::CaseInsensitiveOption
|
||||
| QRegularExpression::UseUnicodePropertiesOption );
|
||||
if( !articleText.contains( tmpReg ) )
|
||||
{
|
||||
allMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(!allMatch)
|
||||
if( !allMatch )
|
||||
{
|
||||
continue;
|
||||
return;
|
||||
}
|
||||
|
||||
if( distanceBetweenWords >= 0 )
|
||||
{
|
||||
// the article text contains all the needed words.
|
||||
// determine if distance restriction is meet
|
||||
QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
|
||||
QRegularExpression::CaseInsensitiveOption |
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
const QRegularExpression replaceReg( QString( "(%1)" ).arg( words.join( '|' ) ),
|
||||
QRegularExpression::CaseInsensitiveOption
|
||||
| QRegularExpression::UseUnicodePropertiesOption );
|
||||
// use a string that could not be presented in the article.
|
||||
articleText = articleText.replace( replaceReg, "=@XXXXX@=" );
|
||||
|
||||
auto hasCJK = false;
|
||||
foreach(QString word,words)
|
||||
foreach( QString word, words )
|
||||
{
|
||||
if(containCJK( word ))
|
||||
if( containCJK( word ) )
|
||||
{
|
||||
hasCJK = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//hascjk value ,perhaps should depend on each word
|
||||
auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ), searchMode, distanceBetweenWords,hasCJK );
|
||||
QRegularExpression distanceOrderReg( searchRegStr,
|
||||
QRegularExpression::CaseInsensitiveOption |
|
||||
QRegularExpression::UseUnicodePropertiesOption );
|
||||
// hascjk value ,perhaps should depend on each word
|
||||
const auto searchRegStr = makeHiliteRegExpString( Utils::repeat( "=@XXXXX@=", words.size() ),
|
||||
searchMode,
|
||||
distanceBetweenWords,
|
||||
hasCJK );
|
||||
const QRegularExpression distanceOrderReg( searchRegStr,
|
||||
QRegularExpression::CaseInsensitiveOption
|
||||
| QRegularExpression::UseUnicodePropertiesOption );
|
||||
// use a string that could not be presented in the article.
|
||||
if(articleText.contains(distanceOrderReg))
|
||||
if( articleText.contains( distanceOrderReg ) )
|
||||
{
|
||||
if( headword.isEmpty() )
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
offsetsForHeadwords.append( offset );
|
||||
else
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
|
||||
|
||||
results++;
|
||||
++results;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
if( articleText.contains( searchRegularExpression ) )
|
||||
{
|
||||
if( headword.isEmpty() )
|
||||
offsetsForHeadwords.append( offsets.at( i ) );
|
||||
offsetsForHeadwords.append( offset );
|
||||
else
|
||||
foundHeadwords->append( FTS::FtsHeadword( headword, id, QStringList(), matchCase ) );
|
||||
|
||||
results++;
|
||||
|
||||
++results;
|
||||
if( maxResults > 0 && results >= maxResults )
|
||||
break;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -637,7 +627,10 @@ void FTSResultsRequest::checkArticles( QVector< uint32_t > const & offsets,
|
|||
QVector< QString > headwords;
|
||||
dict.getHeadwordsFromOffsets( offsetsForHeadwords, headwords, &isCancelled );
|
||||
for( int x = 0; x < headwords.size(); x++ )
|
||||
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ), id, x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(), matchCase ) );
|
||||
foundHeadwords->append( FTS::FtsHeadword( headwords.at( x ),
|
||||
id,
|
||||
x < hiliteRegExps.size() ? hiliteRegExps.at( x ) : QStringList(),
|
||||
matchCase ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -648,27 +641,28 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
|
|||
{
|
||||
// Find articles which contains all requested words
|
||||
|
||||
vector< BtreeIndexing::WordArticleLink > links;
|
||||
QSet< uint32_t > setOfOffsets, tmp;
|
||||
uint32_t size;
|
||||
QSet< uint32_t > setOfOffsets;
|
||||
|
||||
if( indexWords.isEmpty() )
|
||||
return;
|
||||
|
||||
int n = indexWords.length();
|
||||
for( int i = 0; i < n; i++ )
|
||||
QList< QSet< uint32_t > > addressLists;
|
||||
|
||||
auto findLinks = [ & ]( const QString & word )
|
||||
{
|
||||
QSet< uint32_t > tmp;
|
||||
uint32_t size;
|
||||
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
return;
|
||||
addressLists<< tmp;
|
||||
|
||||
tmp.clear();
|
||||
|
||||
links = ftsIndex.findArticles( gd::toWString( indexWords.at( i ) ), ignoreDiacritics );
|
||||
vector< BtreeIndexing::WordArticleLink > links =
|
||||
ftsIndex.findArticles( gd::toWString( word ), ignoreDiacritics );
|
||||
for( unsigned x = 0; x < links.size(); x++ )
|
||||
{
|
||||
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
return;
|
||||
addressLists<< tmp;
|
||||
|
||||
vector< char > chunk;
|
||||
char * linksPtr;
|
||||
|
@ -677,24 +671,31 @@ void FTSResultsRequest::indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
|
|||
linksPtr = chunks->getBlock( links[ x ].articleOffset, chunk );
|
||||
}
|
||||
|
||||
memcpy( &size, linksPtr, sizeof(uint32_t) );
|
||||
linksPtr += sizeof(uint32_t);
|
||||
memcpy( &size, linksPtr, sizeof( uint32_t ) );
|
||||
linksPtr += sizeof( uint32_t );
|
||||
for( uint32_t y = 0; y < size; y++ )
|
||||
{
|
||||
tmp.insert( *( reinterpret_cast< uint32_t * >( linksPtr ) ) );
|
||||
linksPtr += sizeof(uint32_t);
|
||||
linksPtr += sizeof( uint32_t );
|
||||
}
|
||||
}
|
||||
|
||||
links.clear();
|
||||
|
||||
if( i == 0 )
|
||||
setOfOffsets = tmp;
|
||||
addressLists<< tmp;
|
||||
};
|
||||
// int n = indexWords.length();
|
||||
QtConcurrent::blockingMap( indexWords, findLinks );
|
||||
|
||||
int i = 0;
|
||||
for( auto & elem : addressLists )
|
||||
{
|
||||
if( i++ == 0 )
|
||||
setOfOffsets = elem;
|
||||
else
|
||||
setOfOffsets = setOfOffsets.intersect( tmp );
|
||||
setOfOffsets = setOfOffsets.intersect( elem );
|
||||
}
|
||||
|
||||
tmp.clear();
|
||||
|
||||
if( setOfOffsets.isEmpty() )
|
||||
return;
|
||||
|
@ -757,17 +758,15 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
|
|||
|
||||
if( !hieroglyphsList.empty() )
|
||||
{
|
||||
QSet< uint32_t > tmp;
|
||||
vector< BtreeIndexing::WordArticleLink > links;
|
||||
|
||||
for( int i = 0; i < hieroglyphsList.size(); i++ )
|
||||
QList< QSet< uint32_t > > sets;
|
||||
auto fn_wordLink = [ & ](const QString & word )
|
||||
{
|
||||
links = ftsIndex.findArticles( gd::toWString( hieroglyphsList.at( i ) ) );
|
||||
QSet< uint32_t > tmp;
|
||||
vector< BtreeIndexing::WordArticleLink > links = ftsIndex.findArticles( gd::toWString( word ) );
|
||||
for( unsigned x = 0; x < links.size(); x++ )
|
||||
{
|
||||
|
||||
if( Utils::AtomicInt::loadAcquire( isCancelled ) )
|
||||
return;
|
||||
sets<< tmp;
|
||||
|
||||
vector< char > chunk;
|
||||
char * linksPtr;
|
||||
|
@ -786,11 +785,17 @@ void FTSResultsRequest::combinedIndexSearch( BtreeIndexing::BtreeIndex & ftsInde
|
|||
}
|
||||
|
||||
links.clear();
|
||||
sets<< tmp;
|
||||
};
|
||||
QtConcurrent::blockingMap( hieroglyphsList, fn_wordLink );
|
||||
|
||||
if( i == 0 )
|
||||
setOfOffsets = tmp;
|
||||
int i = 0;
|
||||
for( auto & elem : sets )
|
||||
{
|
||||
if( i++ == 0 )
|
||||
setOfOffsets = elem;
|
||||
else
|
||||
setOfOffsets = setOfOffsets.intersect( tmp );
|
||||
setOfOffsets = setOfOffsets.intersect( elem );
|
||||
}
|
||||
|
||||
allWordsLinks[ wordNom ] = setOfOffsets;
|
||||
|
|
|
@ -82,12 +82,16 @@ class FTSResultsRequest : public Dictionary::DataRequest
|
|||
|
||||
QAtomicInt isCancelled;
|
||||
|
||||
QAtomicInt results;
|
||||
|
||||
QList< FTS::FtsHeadword > * foundHeadwords;
|
||||
|
||||
void checkArticles( QVector< uint32_t > const & offsets,
|
||||
QStringList const & words,
|
||||
QRegExp const & searchRegexp = QRegExp() );
|
||||
|
||||
void checkSingleArticle( uint32_t offset, QStringList const & words, QRegExp const & searchRegexp = QRegExp() );
|
||||
|
||||
void indexSearch( BtreeIndexing::BtreeIndex & ftsIndex,
|
||||
sptr< ChunkedStorage::Reader > chunks,
|
||||
QStringList & indexWords,
|
||||
|
@ -127,6 +131,7 @@ public:
|
|||
searchString = gd::toQString( Folding::applyDiacriticsOnly( gd::toWString( searchString_ ) ) );
|
||||
|
||||
foundHeadwords = new QList< FTS::FtsHeadword >;
|
||||
results = 0;
|
||||
QThreadPool::globalInstance()->start( [ this ]() { this->run(); }, -100 );
|
||||
}
|
||||
|
||||
|
|
|
@ -47,7 +47,8 @@ QT += core \
|
|||
webenginewidgets\
|
||||
webchannel\
|
||||
printsupport \
|
||||
help
|
||||
help \
|
||||
concurrent
|
||||
|
||||
greaterThan(QT_MAJOR_VERSION, 5): QT += webenginecore core5compat
|
||||
|
||||
|
@ -242,6 +243,7 @@ HEADERS += folding.hh \
|
|||
ankiconnector.h \
|
||||
article_inspect.h \
|
||||
articlewebpage.h \
|
||||
base/globalregex.hh \
|
||||
globalbroadcaster.h \
|
||||
iframeschemehandler.h \
|
||||
inc_case_folding.hh \
|
||||
|
@ -384,6 +386,7 @@ SOURCES += folding.cc \
|
|||
ankiconnector.cpp \
|
||||
article_inspect.cpp \
|
||||
articlewebpage.cpp \
|
||||
base/globalregex.cc \
|
||||
globalbroadcaster.cpp \
|
||||
iframeschemehandler.cpp \
|
||||
main.cc \
|
||||
|
|
81
mdx.cc
81
mdx.cc
|
@ -42,6 +42,7 @@
|
|||
|
||||
#include "tiff.hh"
|
||||
#include "utils.hh"
|
||||
#include "base/globalregex.hh"
|
||||
|
||||
namespace Mdx
|
||||
{
|
||||
|
@ -192,51 +193,6 @@ public:
|
|||
|
||||
};
|
||||
|
||||
struct MdxRegex
|
||||
{
|
||||
MdxRegex() :
|
||||
allLinksRe( "(?:<\\s*(a(?:rea)?|img|link|script|source)(?:\\s+[^>]+|\\s*)>)",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
wordCrossLink( "([\\s\"']href\\s*=)\\s*([\"'])entry://([^>#]*?)((?:#[^>]*?)?)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
anchorIdRe( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)", QRegularExpression::CaseInsensitiveOption ),
|
||||
anchorIdReWord( "([\\s\"'](?:name|id)\\s*=)\\s*([\"'])\\s*(?=\\S)([^\"]*)", QRegularExpression::CaseInsensitiveOption ),
|
||||
anchorIdRe2( "([\\s\"'](?:name|id)\\s*=)\\s*(?=[^\"'])([^\\s\">]+)", QRegularExpression::CaseInsensitiveOption ),
|
||||
anchorLinkRe( "([\\s\"']href\\s*=\\s*[\"'])entry://#", QRegularExpression::CaseInsensitiveOption ),
|
||||
audioRe( "([\\s\"']href\\s*=)\\s*([\"'])sound://([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption ),
|
||||
stylesRe( "([\\s\"']href\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
stylesRe2( "([\\s\"']href\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
inlineScriptRe( "<\\s*script(?:(?=\\s)(?:(?![\\s\"']src\\s*=)[^>])+|\\s*)>",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
closeScriptTagRe( "<\\s*/script\\s*>", QRegularExpression::CaseInsensitiveOption ),
|
||||
srcRe( "([\\s\"']src\\s*=)\\s*([\"'])(?!\\s*\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\">]+)\\2",
|
||||
QRegularExpression::CaseInsensitiveOption ),
|
||||
srcRe2( "([\\s\"']src\\s*=)\\s*(?![\\s\"']|\\b(?:(?:bres|https?|ftp)://"
|
||||
"|(?:data|javascript):))(?:file://)?[\\x00-\\x1f\\x7f]*\\.*/?([^\\s\">]+)",
|
||||
QRegularExpression::CaseInsensitiveOption )
|
||||
{
|
||||
}
|
||||
QRegularExpression allLinksRe;
|
||||
QRegularExpression wordCrossLink;
|
||||
QRegularExpression anchorIdRe;
|
||||
QRegularExpression anchorIdReWord;
|
||||
QRegularExpression anchorIdRe2;
|
||||
QRegularExpression anchorLinkRe;
|
||||
QRegularExpression audioRe;
|
||||
QRegularExpression stylesRe;
|
||||
QRegularExpression stylesRe2;
|
||||
QRegularExpression inlineScriptRe;
|
||||
QRegularExpression closeScriptTagRe;
|
||||
QRegularExpression srcRe;
|
||||
QRegularExpression srcRe2;
|
||||
};
|
||||
|
||||
class MdxDictionary: public BtreeIndexing::BtreeDictionary
|
||||
{
|
||||
Mutex idxMutex;
|
||||
|
@ -256,8 +212,6 @@ class MdxDictionary: public BtreeIndexing::BtreeDictionary
|
|||
string initError;
|
||||
QString cacheDirName;
|
||||
|
||||
static MdxRegex mdxRx;
|
||||
|
||||
public:
|
||||
|
||||
MdxDictionary( string const & id, string const & indexFile, vector<string> const & dictionaryFiles );
|
||||
|
@ -347,8 +301,6 @@ private:
|
|||
friend class MddResourceRequest;
|
||||
};
|
||||
|
||||
MdxRegex MdxDictionary::mdxRx;
|
||||
|
||||
MdxDictionary::MdxDictionary( string const & id, string const & indexFile,
|
||||
vector<string> const & dictionaryFiles ):
|
||||
BtreeDictionary( id, dictionaryFiles ),
|
||||
|
@ -972,10 +924,11 @@ void MdxDictionary::loadArticle( uint32_t offset, string & articleText, bool noF
|
|||
decompressed.constData() + recordInfo.recordOffset,
|
||||
recordInfo.recordSize );
|
||||
|
||||
article = MdictParser::substituteStylesheet( article, styleSheets );
|
||||
|
||||
if( !noFilter )
|
||||
{
|
||||
article = MdictParser::substituteStylesheet( article, styleSheets );
|
||||
article = filterResource( articleId, article );
|
||||
}
|
||||
|
||||
articleText = article.toStdString();
|
||||
}
|
||||
|
@ -987,7 +940,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
|
||||
QString articleNewText;
|
||||
int linkPos = 0;
|
||||
QRegularExpressionMatchIterator it = mdxRx.allLinksRe.globalMatch( article );
|
||||
QRegularExpressionMatchIterator it = RX::Mdx::allLinksRe.globalMatch( article );
|
||||
QMap<QString,QString> idMap;
|
||||
while( it.hasNext() )
|
||||
{
|
||||
|
@ -1005,10 +958,10 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
|
||||
if( !linkType.isEmpty() && linkType.at( 0 ) == 'a' )
|
||||
{
|
||||
QRegularExpressionMatch match = mdxRx.anchorIdRe.match( linkTxt );
|
||||
QRegularExpressionMatch match = RX::Mdx::anchorIdRe.match( linkTxt );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
auto wordMatch = mdxRx.anchorIdReWord.match( linkTxt );
|
||||
auto wordMatch = RX::Mdx::anchorIdReWord.match( linkTxt );
|
||||
if( wordMatch.hasMatch() )
|
||||
{
|
||||
idMap.insert( wordMatch.captured( 3 ), uniquePrefix + wordMatch.captured( 3 ) );
|
||||
|
@ -1017,11 +970,11 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
|
||||
}
|
||||
else
|
||||
newLink = linkTxt.replace( mdxRx.anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
|
||||
newLink = linkTxt.replace( RX::Mdx::anchorIdRe2, "\\1\"" + uniquePrefix + "\\2\"" );
|
||||
|
||||
newLink = newLink.replace( mdxRx.anchorLinkRe, "\\1#" + uniquePrefix );
|
||||
newLink = newLink.replace( RX::Mdx::anchorLinkRe, "\\1#" + uniquePrefix );
|
||||
|
||||
match = mdxRx.audioRe.match( newLink );
|
||||
match = RX::Mdx::audioRe.match( newLink );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
// sounds and audio link script
|
||||
|
@ -1032,7 +985,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
+ newLink.replace( match.capturedStart(), match.capturedLength(), newTxt );
|
||||
}
|
||||
|
||||
match = mdxRx.wordCrossLink.match( newLink );
|
||||
match = RX::Mdx::wordCrossLink.match( newLink );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
QString newTxt = match.captured( 1 ) + match.captured( 2 )
|
||||
|
@ -1050,7 +1003,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
if( linkType.compare( "link" ) == 0 )
|
||||
{
|
||||
// stylesheets
|
||||
QRegularExpressionMatch match = mdxRx.stylesRe.match( linkTxt );
|
||||
QRegularExpressionMatch match = RX::Mdx::stylesRe.match( linkTxt );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
QString newText = match.captured( 1 ) + match.captured( 2 )
|
||||
|
@ -1059,7 +1012,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
|
||||
}
|
||||
else
|
||||
newLink = linkTxt.replace( mdxRx.stylesRe2,
|
||||
newLink = linkTxt.replace( RX::Mdx::stylesRe2,
|
||||
"\\1\"bres://" + id + "/\\2\"" );
|
||||
}
|
||||
else
|
||||
|
@ -1067,13 +1020,13 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
|| linkType.compare( "source" ) == 0 )
|
||||
{
|
||||
// javascripts and images
|
||||
QRegularExpressionMatch match = mdxRx.inlineScriptRe.match( linkTxt );
|
||||
QRegularExpressionMatch match = RX::Mdx::inlineScriptRe.match( linkTxt );
|
||||
if( linkType.at( 1 ) == 'c' // "script" tag
|
||||
&& match.hasMatch() && match.capturedLength() == linkTxt.length() )
|
||||
{
|
||||
// skip inline scripts
|
||||
articleNewText += linkTxt;
|
||||
match = mdxRx.closeScriptTagRe.match( article, linkPos );
|
||||
match = RX::Mdx::closeScriptTagRe.match( article, linkPos );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
articleNewText += article.mid( linkPos, match.capturedEnd() - linkPos );
|
||||
|
@ -1083,7 +1036,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
}
|
||||
else
|
||||
{
|
||||
match = mdxRx.srcRe.match( linkTxt );
|
||||
match = RX::Mdx::srcRe.match( linkTxt );
|
||||
if( match.hasMatch() )
|
||||
{
|
||||
QString newText;
|
||||
|
@ -1104,7 +1057,7 @@ QString & MdxDictionary::filterResource( QString const & articleId, QString & ar
|
|||
newLink = linkTxt.replace( match.capturedStart(), match.capturedLength(), newText );
|
||||
}
|
||||
else
|
||||
newLink = linkTxt.replace( mdxRx.srcRe2,
|
||||
newLink = linkTxt.replace( RX::Mdx::srcRe2,
|
||||
"\\1\"bres://" + id + "/\\2\"" );
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue