Merge pull request #206 from xiaoyifang/fix/zim

fix: zim dictionary auto refresh logic
This commit is contained in:
xiaoyifang 2022-11-19 17:46:46 +08:00 committed by GitHub
commit 276056c9e1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 73 additions and 61 deletions

View file

@ -167,10 +167,17 @@ QNetworkReply * ArticleNetworkAccessManager::getArticleReply( QNetworkRequest co
if(req.url().scheme()=="gdlookup"){
QString path=url.path();
if(!path.isEmpty()){
Utils::Url::addQueryItem(url,"word",path.mid(1));
url.setPath("");
Utils::Url::addQueryItem(url,"group","1");
if( !path.isEmpty() )
{
url.setPath( "" );
QByteArray referer = req.rawHeader( "Referer" );
QUrl refererUrl = QUrl::fromEncoded( referer );
Utils::Url::addQueryItem( url, "word", path.mid( 1 ) );
if( Utils::Url::hasQueryItem( refererUrl, "group" ) )
{
Utils::Url::addQueryItem( url, "group", Utils::Url::queryItemValue( refererUrl, "group" ) );
}
}
}
@ -311,7 +318,7 @@ sptr< Dictionary::DataRequest > ArticleNetworkAccessManager::getResource(
bool ignoreDiacritics = Utils::Url::queryItemValue( url, "ignore_diacritics" ) == "1";
if ( groupIsValid && phrase.isValid() ) // Require group and phrase to be passed
if ( phrase.isValid() ) // Require group and phrase to be passed
return articleMaker.makeDefinitionFor( phrase, group, contexts, mutedDicts, QStringList(), ignoreDiacritics );
}

View file

@ -65,3 +65,4 @@ QRegularExpression Mdx::styleElment( "(<style[^>]*>)([\\w\\W]*?)(<\\/style>)",
QRegularExpression::CaseInsensitiveOption);
QRegularExpression Zim::linkSpecialChar("[\\.\\/]");

View file

@ -45,6 +45,11 @@ public:
static QRegularExpression styleElment;
};
class Zim{
public:
static QRegularExpression linkSpecialChar;
};
} // namespace RX
#endif // GLOBALREGEX_HH

111
zim.cc
View file

@ -42,6 +42,7 @@
#include <map>
#include <algorithm>
#include <QtConcurrent>
#include "base/globalregex.hh"
namespace Zim {
@ -838,7 +839,7 @@ string ZimDictionary::convert( const string & in )
QString( "<body \\1" ) );
// pattern of img and script
text.replace( QRegularExpression( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\.|)/" ),
text.replace( QRegularExpression( "<\\s*(img|script)\\s+([^>]*)src=(\"|)(\\.\\./)*" ),
QString( "<\\1 \\2src=\\3bres://%1/").arg( getId().c_str() ) );
// Fix links without '"'
@ -874,54 +875,35 @@ string ZimDictionary::convert( const string & in )
for( int i = list.size(); i < 5; i++ )
list.append( QString() );
QString formatTag;
QString tag = list[3]; // a url, ex: Precambrian_Chaotian.html
if ( !list[4].isEmpty() ) // a title, ex: title="Precambrian/Chaotian"
tag = list[4].split("\"")[1];
// Check type of links inside articles
if( linksType == UNKNOWN && tag.indexOf( '/' ) >= 0 )
{
QString word = QUrl::fromPercentEncoding( tag.toLatin1() );
QRegularExpression htmlRx( "\\.(s|)htm(l|)$", QRegularExpression::CaseInsensitiveOption );
word.remove( htmlRx ).
replace( "_", " " );
vector< WordArticleLink > links;
links = findArticles( gd::toWString( word ) );
if( !links.empty() )
{
linksType = SLASH;
}
else
{
word.remove( QRegularExpression(".*/") );
links = findArticles( gd::toWString( word ) );
if( !links.empty() )
{
linksType = NO_SLASH;
links.clear();
}
}
tag = list[4];
formatTag=tag.split("\"")[1];
}
else{
//tag from list[3]
formatTag = tag;
}
if( linksType == SLASH || linksType == UNKNOWN )
{
tag.remove( QRegularExpression( "\\.(s|)htm(l|)$", QRegularExpression::PatternOption::CaseInsensitiveOption ) ).
replace( "_", "%20" ).
prepend( "<a href=\"gdlookup://localhost/" ).
append( "\" " + list[4] + ">" );
formatTag.replace(RX::Zim::linkSpecialChar,"");
vector< WordArticleLink > links;
links = findArticles( gd::toWString( formatTag ) );
QString urlLink = match.captured();
QString replacedLink ;
if(!links.empty()){
replacedLink = urlLink.replace(tag,"gdlookup://localhost/"+formatTag);
}
else
{
tag.remove( QRegularExpression(".*/") ).
remove( QRegularExpression( "\\.(s|)htm(l|)$", QRegularExpression::PatternOption::CaseInsensitiveOption ) ).
replace( "_", "%20" ).
prepend( "<a href=\"gdlookup://localhost/" ).
append( "\" " + list[4] + ">" );
else{
replacedLink = urlLink.replace(tag,"bres://localhost/"+formatTag);
}
newText += tag;
newText += replacedLink;
}
if( pos )
{
@ -1507,9 +1489,9 @@ void ZimResourceRequest::run()
}
sptr< Dictionary::DataRequest > ZimDictionary::getResource( string const & name )
{
return new ZimResourceRequest( *this, name );
auto formatedName = QString::fromStdString(name).replace(RX::Zim::linkSpecialChar,"");
return new ZimResourceRequest( *this, formatedName.toStdString() );
}
//} // anonymous namespace
@ -1648,25 +1630,41 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
|| ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) ) ) )
{
wstring word;
if( !title.empty() )
word = Utf8::decode( title );
else
word = Utf8::decode( url );
if( df.isArticleMime( mimetype )
|| ( mimetype == 0xFFFF && df.isArticleMime( redirected_mime ) ) )
{
if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand )
indexedWords.addSingleWord( word, n );
{
if( !title.empty() )
{
word = Utf8::decode( title );
indexedWords.addSingleWord( word, n );
}
if( !url.empty() )
{
auto formatedUrl = QString::fromStdString( url ).replace( RX::Zim::linkSpecialChar, "" );
indexedWords.addSingleWord( Utf8::decode( formatedUrl.toStdString() ), n );
}
}
else
indexedWords.addWord( word, n );
{
if( !title.empty() )
{
word = Utf8::decode( title );
indexedWords.addWord( word, n );
}
if( !url.empty() )
{
auto formatedUrl = QString::fromStdString( url ).replace( RX::Zim::linkSpecialChar, "" );
indexedWords.addWord( Utf8::decode( formatedUrl.toStdString() ), n );
}
}
wordCount++;
}
else
{
url.insert( url.begin(), '/' );
url.insert( url.begin(), nameSpace );
indexedResources.addSingleWord( Utf8::decode( url ), n );
auto formatedUrl = QString::fromStdString(url).replace(RX::Zim::linkSpecialChar,"");
indexedResources.addSingleWord( Utf8::decode( formatedUrl.toStdString() ), n );
}
}
else
@ -1702,9 +1700,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
}
else
{
url.insert( url.begin(), '/' );
url.insert( url.begin(), nameSpace );
indexedResources.addSingleWord( Utf8::decode( url ), n );
// url.insert( url.begin(), '/' );
// url.insert( url.begin(), nameSpace );
auto formatedUrl = QString::fromStdString(url).replace(RX::Zim::linkSpecialChar,"");
indexedResources.addSingleWord( Utf8::decode( formatedUrl.toStdString() ), n );
}
}