2012-02-20 21:47:14 +00:00
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2009-03-26 19:00:08 +00:00
* Part of GoldenDict . Licensed under GPLv3 or later , see the LICENSE file */
# include "mediawiki.hh"
2009-04-18 17:20:12 +00:00
# include "wstring_qt.hh"
2009-03-26 19:00:08 +00:00
# include <QNetworkAccessManager>
# include <QNetworkReply>
# include <QUrl>
# include <QtXml>
2021-06-10 16:28:23 +00:00
# include <algorithm>
2009-05-05 08:56:46 +00:00
# include <list>
2013-11-16 18:34:09 +00:00
# include "gddebug.hh"
2012-01-09 13:48:27 +00:00
# include "audiolink.hh"
2013-07-10 13:48:09 +00:00
# include "langcoder.hh"
2021-11-27 07:17:33 +00:00
# include "utils.hh"
2009-03-26 19:00:08 +00:00
2018-02-27 16:42:21 +00:00
# include <QRegularExpression>
2022-07-11 13:01:49 +00:00
# include "globalbroadcaster.h"
2018-02-27 16:42:21 +00:00
2009-03-26 19:00:08 +00:00
namespace MediaWiki {
using namespace Dictionary ;
namespace {
/// Dictionary implementation backed by a remote MediaWiki site.
/// It stores the display name, the site url, an optional icon name and a
/// reference to the network manager that is handed to every request it creates.
class MediaWikiDictionary : public Dictionary : : Class
{
string name ;
2012-06-21 09:43:28 +00:00
QString url , icon ;
2009-03-26 19:00:08 +00:00
QNetworkAccessManager & netMgr ;
2013-07-10 13:48:09 +00:00
// Returned by both getLangFrom() and getLangTo(); stays 0 unless the
// constructor derives a value from the url.
quint32 langId ;
2009-04-21 20:09:02 +00:00
2009-03-26 19:00:08 +00:00
public :
/// @param id     dictionary id, forwarded to the Dictionary::Class base
/// @param name_  name returned by getName()
/// @param url_   base url of the wiki; request urls are built by appending to it
/// @param icon_  icon file name looked up by loadIcon(); may be empty
/// @param netMgr_ network manager used for all requests (kept by reference)
MediaWikiDictionary ( string const & id , string const & name_ ,
QString const & url_ ,
2012-06-21 09:43:28 +00:00
QString const & icon_ ,
2009-03-26 19:00:08 +00:00
QNetworkAccessManager & netMgr_ ) :
Dictionary : : Class ( id , vector < string > ( ) ) ,
name ( name_ ) ,
url ( url_ ) ,
2012-06-21 09:43:28 +00:00
icon ( icon_ ) ,
2013-07-10 13:48:09 +00:00
netMgr ( netMgr_ ) ,
langId ( 0 )
2009-03-26 19:00:08 +00:00
{
2013-07-10 13:48:09 +00:00
// Take the two characters right before the first match of the searched
// substring as a language code, but only if the match is at index 2 or the
// character three positions before it equals the tested char literal.
// NOTE(review): presumably this picks up a two-letter language subdomain
// (e.g. "en" in "https://en.wikipedia.org") - confirm.
int n = url . indexOf ( " . " ) ;
if ( n = = 2 | | ( n > 3 & & url [ n - 3 ] = = ' / ' ) )
langId = LangCoder : : code2toInt ( url . mid ( n - 2 , 2 ) . toLatin1 ( ) . data ( ) ) ;
2009-03-26 19:00:08 +00:00
}
2009-04-21 20:09:02 +00:00
2022-12-29 07:07:40 +00:00
/// Returns the name passed to the constructor.
string getName ( ) noexcept override
2009-03-26 19:00:08 +00:00
{ return name ; }
2022-12-29 07:07:40 +00:00
/// Always returns an empty property map.
map < Property , string > getProperties ( ) noexcept override
2009-03-26 19:00:08 +00:00
{ return map < Property , string > ( ) ; }
2022-12-29 07:07:40 +00:00
/// Always returns 0.
unsigned long getArticleCount ( ) noexcept override
2009-03-26 19:00:08 +00:00
{ return 0 ; }
2022-12-29 07:07:40 +00:00
/// Always returns 0.
unsigned long getWordCount ( ) noexcept override
2009-03-26 19:00:08 +00:00
{ return 0 ; }
2022-12-29 07:07:40 +00:00
/// Creates a word search request for the given word; maxResults is not used
/// by the implementation in this file.
sptr < WordSearchRequest > prefixMatch ( wstring const & ,
unsigned long maxResults ) override ;
2009-03-26 19:00:08 +00:00
2022-12-29 07:07:40 +00:00
/// Creates an article request for the given word and its alternative forms.
/// The last two parameters are ignored by the implementation in this file.
sptr < DataRequest > getArticle ( wstring const & , vector < wstring > const & alts ,
wstring const & , bool ) override ;
2012-12-03 12:47:43 +00:00
2022-12-29 07:07:40 +00:00
/// Source language id; identical to the target language id.
quint32 getLangFrom ( ) const override
2013-07-10 13:48:09 +00:00
{ return langId ; }
2022-12-29 07:07:40 +00:00
/// Target language id; identical to the source language id.
quint32 getLangTo ( ) const override
2013-07-10 13:48:09 +00:00
{ return langId ; }
2012-12-03 12:47:43 +00:00
protected :
2022-12-29 07:07:40 +00:00
/// Loads the dictionary icon (see the out-of-class definition).
void loadIcon ( ) noexcept override ;
2012-12-03 12:47:43 +00:00
2009-03-26 19:00:08 +00:00
} ;
2022-06-03 13:28:41 +00:00
/// Loads the dictionary icon at most once. A file named by `icon` inside the
/// configuration directory is tried first; if dictionaryIcon is still null
/// afterwards, one of two bundled resource icons is selected depending on
/// whether the url contains the substring tested below.
void MediaWikiDictionary : : loadIcon ( ) noexcept
2012-12-03 12:47:43 +00:00
{
2013-01-31 22:30:11 +00:00
// A previous call already did the work.
if ( dictionaryIconLoaded )
return ;
2012-12-03 12:47:43 +00:00
// Try the configured icon name, resolved relative to the config directory.
if ( ! icon . isEmpty ( ) )
{
QFileInfo fInfo ( QDir ( Config : : getConfigDir ( ) ) , icon ) ;
if ( fInfo . isFile ( ) )
loadIconFromFile ( fInfo . absoluteFilePath ( ) , true ) ;
}
// Nothing was set above: fall back to a built-in resource icon.
if ( dictionaryIcon . isNull ( ) )
2022-03-24 12:14:01 +00:00
{
if ( url . contains ( " tionary " ) )
dictionaryIcon = dictionaryNativeIcon = QIcon ( " :/icons/wiktionary.png " ) ;
else
dictionaryIcon = dictionaryNativeIcon = QIcon ( " :/icons/icon32_wiki.png " ) ;
}
2012-12-03 12:47:43 +00:00
dictionaryIconLoaded = true ;
}
2009-03-26 19:00:08 +00:00
class MediaWikiWordSearchRequest : public MediaWikiWordSearchRequestSlots
{
sptr < QNetworkReply > netReply ;
bool isCancelling ;
public :
MediaWikiWordSearchRequest ( wstring const & ,
QString const & url , QNetworkAccessManager & mgr ) ;
~ MediaWikiWordSearchRequest ( ) ;
2022-12-29 07:07:40 +00:00
void cancel ( ) override ;
2009-03-26 19:00:08 +00:00
private :
2009-04-21 20:09:02 +00:00
2022-12-29 07:07:40 +00:00
void downloadFinished ( ) override ;
2009-03-26 19:00:08 +00:00
} ;
/// Starts the title search: builds the request url from `url` plus the query
/// string below, adds `str` as the "apfrom"-style query item, issues the GET
/// through `mgr` and connects the reply's finished() signal to
/// downloadFinished().
/// @param str word whose titles are searched for
/// @param url base url of the wiki
/// @param mgr network manager used to issue the request
MediaWikiWordSearchRequest : : MediaWikiWordSearchRequest ( wstring const & str ,
QString const & url ,
2022-02-25 15:41:54 +00:00
QNetworkAccessManager & mgr ) :
isCancelling ( false )
2009-03-26 19:00:08 +00:00
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " request begin \n " ) ;
2013-05-31 04:20:25 +00:00
QUrl reqUrl ( url + " /api.php?action=query&list=allpages&aplimit=40&format=xml " ) ;
2009-03-26 19:00:08 +00:00
2022-07-11 13:01:49 +00:00
// Register the wiki's host with the global whitelist before contacting it.
GlobalBroadcaster : : instance ( ) - > addWhitelist ( reqUrl . host ( ) ) ;
2021-11-27 07:17:33 +00:00
// The plus character is replaced by its percent-encoded form before the
// word is added as a query item.
Utils : : Url : : addQueryItem ( reqUrl , " apfrom " , gd : : toQString ( str ) . replace ( ' + ' , " %2B " ) ) ;
2009-03-26 19:00:08 +00:00
2022-11-29 03:54:31 +00:00
// NOTE(review): the reply is held by a shared_ptr with the default deleter,
// so it is destroyed with a plain delete rather than deleteLater() - confirm
// this is never triggered from within a slot connected to its finished().
netReply = std : : shared_ptr < QNetworkReply > ( mgr . get ( QNetworkRequest ( reqUrl ) ) ) ;
2009-03-26 19:00:08 +00:00
connect ( netReply . get ( ) , SIGNAL ( finished ( ) ) ,
this , SLOT ( downloadFinished ( ) ) ) ;
2022-12-15 07:11:09 +00:00
# ifndef QT_NO_SSL
2015-02-20 14:18:03 +00:00
// SSL errors reported for this reply are ignored.
connect ( netReply . get ( ) , SIGNAL ( sslErrors ( QList < QSslError > ) ) ,
netReply . get ( ) , SLOT ( ignoreSslErrors ( ) ) ) ;
# endif
2009-03-26 19:00:08 +00:00
}
/// Only emits a debug trace; the reply is released by the netReply member.
MediaWikiWordSearchRequest : : ~ MediaWikiWordSearchRequest ( )
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " request end \n " ) ;
2009-03-26 19:00:08 +00:00
}
/// Cancels the search: sets the cancelling flag (checked by
/// downloadFinished()), releases the reply if one is held and finishes the
/// request.
void MediaWikiWordSearchRequest : : cancel ( )
{
// We either finish it in place, or in the timer handler
// NOTE(review): no timer handler is visible in this file; the request is
// always finished in place below.
isCancelling = true ;
2022-02-25 15:41:54 +00:00
if ( netReply . get ( ) )
2009-03-26 19:00:08 +00:00
netReply . reset ( ) ;
2022-02-25 15:41:54 +00:00
finish ( ) ;
GD_DPRINTF ( " cancel the request " ) ;
2009-03-26 19:00:08 +00:00
}
/// Handles the finished reply. On a network error the reply's error string is
/// recorded; otherwise the body is parsed as XML and the title attribute of
/// every matching element under api/query/allpages is appended to `matches`.
/// The request is finished in all cases except when it was already cancelled
/// or finished.
void MediaWikiWordSearchRequest : : downloadFinished ( )
{
if ( isCancelling | | isFinished ( ) ) // Was cancelled
return ;
if ( netReply - > error ( ) = = QNetworkReply : : NoError )
{
QDomDocument dd ;
QString errorStr ;
// Only written by setContent() when parsing fails.
int errorLine , errorColumn ;
if ( ! dd . setContent ( netReply . get ( ) , false , & errorStr , & errorLine , & errorColumn ) )
{
setErrorString ( QString ( tr ( " XML parse error: %1 at %2,%3 " ) .
arg ( errorStr ) . arg ( errorLine ) . arg ( errorColumn ) ) ) ;
}
else
{
QDomNode pages = dd . namedItem ( " api " ) . namedItem ( " query " ) . namedItem ( " allpages " ) ;
if ( ! pages . isNull ( ) )
{
QDomNodeList nl = pages . toElement ( ) . elementsByTagName ( " p " ) ;
// matches is modified under dataMutex.
Mutex : : Lock _ ( dataMutex ) ;
2009-04-21 20:09:02 +00:00
2022-04-05 12:53:25 +00:00
for ( int x = 0 ; x < nl . length ( ) ; + + x )
2009-04-18 17:20:12 +00:00
matches . push_back ( gd : : toWString ( nl . item ( x ) . toElement ( ) . attribute ( " title " ) ) ) ;
2009-03-26 19:00:08 +00:00
}
}
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " done. \n " ) ;
2009-03-26 19:00:08 +00:00
}
else
setErrorString ( netReply - > errorString ( ) ) ;
finish ( ) ;
}
2023-02-07 06:26:36 +00:00
/// Builds table-of-contents HTML from the sections element of a MediaWiki
/// parse reply. The only public entry point is the static
/// generateTableOfContentsIfEmpty(); instances are created internally.
class MediaWikiSectionsParser
{
public :
/// Since a recent Wikipedia UI redesign, the table of contents (ToC) is no longer part of an article's HTML.
/// ToC is absent from the text node of Wikipedia's MediaWiki API reply. Quote from
/// https://www.mediawiki.org/wiki/Reading/Web/Desktop_Improvements/Features/Table_of_contents#How_can_I_get_the_old_table_of_contents?
/// We intentionally do not add the old table of contents to the article in addition to the new sidebar location...
/// Users can restore the old table of contents position with the following JavaScript code:
/// document.querySelector('mw\\3Atocplace,meta[property="mw:PageProp/toc"]').replaceWith( document.getElementById('mw-panel-toc') )
///
/// This function searches for an indicator of the empty ToC in an article HTML. If the indicator is present,
/// generates ToC HTML from the sections element and replaces the indicator with the generated ToC.
///
/// @param parseNode      node whose child sections element is used as the ToC source
/// @param articleString  article HTML; modified in place only when the indicator is found
///                       and the sections element exists
static void generateTableOfContentsIfEmpty ( QDomNode const & parseNode , QString & articleString )
{
QString const emptyTocIndicator = " <meta property= \" mw:PageProp/toc \" /> " ;
int const emptyTocPos = articleString . indexOf ( emptyTocIndicator ) ;
if ( emptyTocPos = = - 1 )
return ; // The ToC must be absent or nonempty => nothing to do.
QDomElement const sectionsElement = parseNode . firstChildElement ( " sections " ) ;
if ( sectionsElement . isNull ( ) )
{
gdWarning ( " MediaWiki: empty table of contents and missing sections element. " ) ;
return ;
}
gdDebug ( " MediaWiki: generating table of contents from the sections element. " ) ;
MediaWikiSectionsParser parser ;
parser . generateTableOfContents ( sectionsElement ) ;
// The indicator is replaced even if the generated ToC ended up empty.
articleString . replace ( emptyTocPos , emptyTocIndicator . size ( ) , parser . tableOfContents ) ;
}
private :
MediaWikiSectionsParser ( ) : previousLevel ( 0 ) { }
// Fills tableOfContents from the children of sectionsElement.
void generateTableOfContents ( QDomElement const & sectionsElement ) ;
// Opens the list item for one section; returns false on an unsupported level.
bool addListLevel ( QString const & levelString ) ;
// Closes open tags until previousLevel equals currentLevel.
void closeListTags ( int currentLevel ) ;
// The ToC HTML generated so far.
QString tableOfContents ;
// Nesting level of the most recently added section; 0 before the first one.
int previousLevel ;
} ;
/// Generates the ToC HTML into tableOfContents from the child elements of
/// sectionsElement. Leaves tableOfContents untouched when there are no such
/// children, and clears it when addListLevel() rejects a section's level.
void MediaWikiSectionsParser : : generateTableOfContents ( QDomElement const & sectionsElement )
{
// A real example of a typical child of the <sections> element:
// <s linkAnchor="Marginal_densities" toclevel="2" fromtitle="Probability_density_function" level="3"
// line="Marginal densities" byteoffset="15868" anchor="Marginal_densities" number="7.1" index="9"/>
// Use Wiktionary's ToC style, which had also been Wikipedia's ToC style until the UI redesign.
// Replace double quotes with single quotes to avoid escaping " within string literals.
QString const elTagName = " s " ;
QDomElement el = sectionsElement . firstChildElement ( elTagName ) ;
if ( el . isNull ( ) )
return ;
// Omit invisible and useless toctogglecheckbox, toctogglespan and toctogglelabel elements.
// The values of lang (e.g. 'en') and dir (e.g. 'ltr') attributes of the toctitle element depend on
// the article's language. These attributes have no visible effect and so are simply omitted here.
// TODO: the "Contents" string should be translated to the article's language, but I don't know how
// to implement this. Should "Contents" be enclosed in tr() to at least translate it to GoldenDict's
// interface language? Is there a language-agnostic Unicode symbol that stands for "Contents"?
tableOfContents = " <div id='toc' class='toc' role='navigation' aria-labelledby='mw-toc-heading'> "
" <div class='toctitle'><h2 id='mw-toc-heading'>Contents</h2></div> " ;
do
{
// Abort and discard everything generated so far on an unsupported level.
if ( ! addListLevel ( el . attribute ( " toclevel " ) ) )
{
tableOfContents . clear ( ) ;
return ;
}
// From https://gerrit.wikimedia.org/r/c/mediawiki/core/+/831147/
// The anchor property ... should be used if you want to (eg) look up an element by ID using
// document.getElementById(). The linkAnchor property ... contains additional escaping appropriate for
// use in a URL fragment, and should be used (eg) if you are creating the href attribute of an <a> tag.
// NOTE(review): the attribute values are inserted into the HTML as-is,
// without further escaping.
tableOfContents + = " <a href='# " ;
tableOfContents + = el . attribute ( " linkAnchor " ) ;
tableOfContents + = " '> " ;
// Omit <span class="tocnumber"> because it has no visible effect.
tableOfContents + = el . attribute ( " number " ) ;
tableOfContents + = ' ' ;
// Omit <span class="toctext"> because it has no visible effect.
tableOfContents + = el . attribute ( " line " ) ;
tableOfContents + = " </a> " ;
el = el . nextSiblingElement ( elTagName ) ;
} while ( ! el . isNull ( ) ) ;
// Unwind any nesting that is still open down to the first level.
closeListTags ( 1 ) ;
// Close the first-level list tag and the toc div tag.
tableOfContents + = " </ul> \n </div> " ;
}
/// Opens the list item for a section at the level given by levelString,
/// first opening a nested list or closing deeper ones as needed.
/// @param levelString section level as text; must parse to a positive integer
///        that exceeds previousLevel by at most one
/// @return true on success; false (after logging a warning) if the level is
///         not an integer, is not positive, or jumps by more than one
bool MediaWikiSectionsParser : : addListLevel ( QString const & levelString )
{
bool convertedToInt ;
int const level = levelString . toInt ( & convertedToInt ) ;
if ( ! convertedToInt )
{
gdWarning ( " MediaWiki: sections level is not an integer: %s " , levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level < = 0 )
{
gdWarning ( " MediaWiki: unsupported nonpositive sections level: %s " , levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level > previousLevel + 1 )
{
gdWarning ( " MediaWiki: unsupported sections level increase by more than one: from %d to %s " ,
previousLevel , levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level = = previousLevel + 1 )
{
// Don't close the previous list item tag to nest the current deeper level's list in it.
tableOfContents + = " \n <ul> \n " ;
previousLevel = level ;
}
else
// Same or shallower level: close the previous item and any deeper lists.
closeListTags ( level ) ;
Q_ASSERT ( level = = previousLevel ) ;
// Open this list item tag.
// Omit the (e.g.) class="toclevel-4 tocsection-9" attribute of <li> because it has no visible effect.
tableOfContents + = " <li> " ;
return true ;
}
/// Closes the previous list item and then one list plus its enclosing list
/// item per level until previousLevel has been decremented to currentLevel.
/// @param currentLevel target level; asserted to be <= previousLevel
void MediaWikiSectionsParser : : closeListTags ( int currentLevel )
{
Q_ASSERT ( currentLevel < = previousLevel ) ;
// Close the previous list item tag.
tableOfContents + = " </li> \n " ;
// Close list and list item tags of deeper levels, if any.
while ( currentLevel < previousLevel )
{
tableOfContents + = " </ul> \n </li> \n " ;
- - previousLevel ;
}
}
2009-03-26 19:00:08 +00:00
class MediaWikiArticleRequest : public MediaWikiDataRequestSlots
{
2016-04-21 15:14:04 +00:00
typedef std : : list < std : : pair < QNetworkReply * , bool > > NetReplies ;
2009-05-05 08:56:46 +00:00
NetReplies netReplies ;
2009-03-26 19:00:08 +00:00
QString url ;
2009-04-21 20:09:02 +00:00
2009-03-26 19:00:08 +00:00
public :
2009-05-05 08:56:46 +00:00
MediaWikiArticleRequest ( wstring const & word , vector < wstring > const & alts ,
2012-01-09 13:48:27 +00:00
QString const & url , QNetworkAccessManager & mgr ,
2013-07-10 13:48:09 +00:00
Class * dictPtr_ ) ;
2009-03-26 19:00:08 +00:00
2022-12-29 07:07:40 +00:00
void cancel ( ) override ;
2009-03-26 19:00:08 +00:00
private :
2009-04-21 20:09:02 +00:00
2009-05-05 08:56:46 +00:00
void addQuery ( QNetworkAccessManager & mgr , wstring const & word ) ;
2022-12-29 07:07:40 +00:00
void requestFinished ( QNetworkReply * ) override ;
2021-06-10 16:28:23 +00:00
/// This simple set implementation should be much more efficient than tree-
/// and hash-based standard/Qt containers when there are very few elements.
template < typename T >
class SmallSet {
public :
bool insert ( T x )
{
if ( std : : find ( elements . begin ( ) , elements . end ( ) , x ) ! = elements . end ( ) )
return false ;
elements . push_back ( x ) ;
return true ;
}
private :
std : : vector < T > elements ;
} ;
/// The page id set allows to filter out duplicate articles in case MediaWiki
/// redirects the main word and words in the alts collection to the same page.
SmallSet < long long > addedPageIds ;
2013-07-10 13:48:09 +00:00
Class * dictPtr ;
2009-03-26 19:00:08 +00:00
} ;
void MediaWikiArticleRequest : : cancel ( )
{
finish ( ) ;
}
/// Subscribes to the manager's finished(QNetworkReply*) signal through a
/// queued connection and then issues the queries: first for the main word,
/// then for every alternative form in order.
/// @param str      main word to look up
/// @param alts     alternative forms, each queried as well
/// @param url_     base url of the wiki
/// @param mgr      network manager used to issue the queries
/// @param dictPtr_ dictionary this request belongs to
MediaWikiArticleRequest::MediaWikiArticleRequest( const wstring & str,
                                                  const vector< wstring > & alts,
                                                  const QString & url_,
                                                  QNetworkAccessManager & mgr,
                                                  Class * dictPtr_ ):
  url( url_ ),
  dictPtr( dictPtr_ )
{
  // The manager-wide signal delivers every reply, including ones that belong
  // to other requests; requestFinished() filters those out.
  connect( &mgr,
           SIGNAL( finished( QNetworkReply * ) ),
           this,
           SLOT( requestFinished( QNetworkReply * ) ),
           Qt::QueuedConnection );

  addQuery( mgr, str );

  for ( const wstring & alternative : alts )
    addQuery( mgr, alternative );
}
/// Issues one parse query for `str` through `mgr` and appends the resulting
/// reply to netReplies with its "finished" flag cleared.
/// @param mgr network manager used to issue the GET request
/// @param str word (page title) to request
void MediaWikiArticleRequest : : addQuery ( QNetworkAccessManager & mgr ,
wstring const & str )
2009-03-26 19:00:08 +00:00
{
2013-11-16 18:34:09 +00:00
gdDebug ( " MediaWiki: requesting article %s \n " , gd : : toQString ( str ) . toUtf8 ( ) . data ( ) ) ;
2009-04-21 20:09:02 +00:00
2023-02-07 06:26:36 +00:00
QUrl reqUrl ( url + " /api.php?action=parse&prop=text|revid|sections&format=xml&redirects " ) ;
2009-03-26 19:00:08 +00:00
2021-11-27 07:17:33 +00:00
// The plus character is replaced by its percent-encoded form before the
// word is added as a query item.
Utils : : Url : : addQueryItem ( reqUrl , " page " , gd : : toQString ( str ) . replace ( ' + ' , " %2B " ) ) ;
2022-01-15 04:53:19 +00:00
QNetworkRequest req ( reqUrl ) ;
// Transfer timeout, in milliseconds.
req . setTransferTimeout ( 3000 ) ;
QNetworkReply * netReply = mgr . get ( req ) ;
// Network errors are only logged here; they are reported to the user from
// requestFinished() via the reply's error string.
// NOTE(review): the lambda uses nothing from its [=] capture.
connect ( netReply , & QNetworkReply : : errorOccurred , this , [ = ] ( QNetworkReply : : NetworkError e ) {
qDebug ( ) < < " error: " < < e ;
} ) ;
2022-12-15 07:11:09 +00:00
# ifndef QT_NO_SSL
2015-02-20 14:18:03 +00:00
2016-04-21 15:14:04 +00:00
// SSL errors reported for this reply are ignored.
connect ( netReply , SIGNAL ( sslErrors ( QList < QSslError > ) ) ,
netReply , SLOT ( ignoreSslErrors ( ) ) ) ;
2015-02-20 14:18:03 +00:00
# endif
2009-05-05 08:56:46 +00:00
netReplies . push_back ( std : : make_pair ( netReply , false ) ) ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
/// Slot invoked for every reply finished by the network manager.
/// Marks `r` as finished if it is one of this request's replies, then
/// consumes the leading run of finished replies from the queue: each
/// successful reply is parsed as XML, its article HTML is rewritten (links,
/// audio, image/srcset urls, table of contents) and appended to `data`.
/// Consumed replies are disconnected and scheduled for deletion. The request
/// is finished once the queue is empty; otherwise update() is called when new
/// data was appended.
/// @param r the reply that has just finished (may belong to another request)
void MediaWikiArticleRequest : : requestFinished ( QNetworkReply * r )
2009-03-26 19:00:08 +00:00
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " Finished. \n " ) ;
2009-03-26 19:00:08 +00:00
if ( isFinished ( ) ) // Was cancelled
return ;
2009-05-05 08:56:46 +00:00
// Find this reply
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
bool found = false ;
for ( NetReplies : : iterator i = netReplies . begin ( ) ; i ! = netReplies . end ( ) ; + + i )
{
2016-04-21 15:14:04 +00:00
if ( i - > first = = r )
2009-03-26 19:00:08 +00:00
{
2009-05-05 08:56:46 +00:00
i - > second = true ; // Mark as finished
found = true ;
break ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
}
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
if ( ! found )
{
// Well, that's not our reply, don't do anything
return ;
}
// Set when at least one article was appended to data during this call.
bool updated = false ;
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
// Consume replies strictly in queue order: stop at the first reply that has
// not finished yet, so articles are appended in the order the queries were
// issued.
for ( ; netReplies . size ( ) & & netReplies . front ( ) . second ; netReplies . pop_front ( ) )
{
2016-04-21 15:14:04 +00:00
QNetworkReply * netReply = netReplies . front ( ) . first ;
2009-05-05 08:56:46 +00:00
if ( netReply - > error ( ) = = QNetworkReply : : NoError )
{
QDomDocument dd ;
QString errorStr ;
// Only written by setContent() when parsing fails.
int errorLine , errorColumn ;
2016-04-21 15:14:04 +00:00
if ( ! dd . setContent ( netReply , false , & errorStr , & errorLine , & errorColumn ) )
2009-05-05 08:56:46 +00:00
{
setErrorString ( QString ( tr ( " XML parse error: %1 at %2,%3 " ) .
arg ( errorStr ) . arg ( errorLine ) . arg ( errorColumn ) ) ) ;
}
else
{
QDomNode parseNode = dd . namedItem ( " api " ) . namedItem ( " parse " ) ;
2021-06-10 16:28:23 +00:00
// Skip replies without a parse node, with the revid value compared
// below, or whose page id has already been added.
if ( ! parseNode . isNull ( ) & & parseNode . toElement ( ) . attribute ( " revid " ) ! = " 0 "
// Don't show the same article more than once:
& & addedPageIds . insert ( parseNode . toElement ( ) . attribute ( " pageid " ) . toLongLong ( ) ) )
2009-03-26 19:00:08 +00:00
{
2009-05-05 08:56:46 +00:00
QDomNode textNode = parseNode . namedItem ( " text " ) ;
if ( ! textNode . isNull ( ) )
2009-03-26 19:00:08 +00:00
{
2009-05-05 08:56:46 +00:00
QString articleString = textNode . toElement ( ) . text ( ) ;
2015-07-24 15:00:39 +00:00
// Replace all ":" in links, remove '#' part in links to other articles
// The article is rebuilt piecewise into articleNewString: text between
// matches is copied verbatim and each matched link is re-emitted.
int pos = 0 ;
2018-02-28 14:15:27 +00:00
QRegularExpression regLinks ( " <a \\ s+href= \" /([^ \" ]+) \ " " ) ;
2018-02-27 16:42:21 +00:00
QString articleNewString ;
QRegularExpressionMatchIterator it = regLinks . globalMatch ( articleString ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-27 16:42:21 +00:00
pos = match . capturedEnd ( ) ;
QString link = match . captured ( 1 ) ;
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
if ( link . indexOf ( " :// " ) > = 0 )
{
// External link
2018-02-27 16:42:21 +00:00
articleNewString + = match . captured ( ) ;
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
continue ;
}
if ( link . indexOf ( ' : ' ) > = 0 )
link . replace ( ' : ' , " %3A " ) ;
// The fragment part, if any, is moved into a gdanchor query parameter.
int n = link . indexOf ( ' # ' , 1 ) ;
if ( n > 0 )
2015-10-31 19:26:50 +00:00
{
QString anchor = link . mid ( n + 1 ) . replace ( ' _ ' , " %5F " ) ;
2015-07-24 15:00:39 +00:00
link . truncate ( n ) ;
2015-10-31 19:26:50 +00:00
link + = QString ( " ?gdanchor=%1 " ) . arg ( anchor ) ;
}
2015-07-24 15:00:39 +00:00
QString newLink = QString ( " <a href= \" /%1 \" " ) . arg ( link ) ;
2018-02-27 16:42:21 +00:00
articleNewString + = newLink ;
}
// pos is nonzero only if at least one match was processed; then append
// the remaining tail and adopt the rebuilt string.
if ( pos )
{
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2018-02-27 16:42:21 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
2013-05-31 04:20:25 +00:00
// The wiki url with its path reset; used as the prefix for absolute urls.
QUrl wikiUrl ( url ) ;
wikiUrl . setPath ( " / " ) ;
2009-05-05 08:56:46 +00:00
// Update any special index.php pages to be absolute
2022-12-24 22:01:50 +00:00
articleString . replace ( QRegularExpression ( R " (<a \ shref= " ( / ( [ \ w ] */ ) * index . php \ ? ) ) " ),
2018-02-27 16:42:21 +00:00
QString ( " <a href= \" %1 \\ 1 " ) . arg ( wikiUrl . toString ( ) ) ) ;
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
2014-07-24 14:10:36 +00:00
// audio tag
// Each audio element containing a source url is replaced by a link with
// a play icon; audio elements without one are kept unchanged.
2018-02-27 16:42:21 +00:00
QRegularExpression reg1 ( " <audio \\ s.+?</audio> " ,
2018-03-11 12:42:02 +00:00
QRegularExpression : : CaseInsensitiveOption
| QRegularExpression : : DotMatchesEverythingOption ) ;
2022-12-24 22:01:50 +00:00
QRegularExpression reg2 ( R " (<source \ s+src= " ( [ ^ " ]+)) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ;
2018-02-27 16:42:21 +00:00
pos = 0 ;
it = reg1 . globalMatch ( articleString ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-27 16:42:21 +00:00
pos = match . capturedEnd ( ) ;
QString tag = match . captured ( ) ;
QRegularExpressionMatch match2 = reg2 . match ( tag ) ;
if ( match2 . hasMatch ( ) )
{
QString ref = match2 . captured ( 1 ) ;
QString audio_url = " <a href= \" " + ref
2023-03-05 20:20:05 +00:00
+ R " ( " > < img src = " qrc:///icons/playsound.png " border = " 0 " align = " absmiddle " alt = " Play " / > < / a > ) " ;
2018-02-27 16:42:21 +00:00
articleNewString + = audio_url ;
}
else
articleNewString + = match . captured ( ) ;
}
if ( pos )
{
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2018-02-27 16:42:21 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
// audio url
// Matching links get the output of addAudioLink() inserted in front of
// them and the wiki url's scheme prepended to their target.
2021-06-11 15:04:42 +00:00
articleString . replace ( QRegularExpression ( " <a \\ s+href= \" (//upload \\ .wikimedia \\ .org/wikipedia/[^ \" '&]* \\ .og[ga](?: \\ .mp3|)) \" " ) ,
2021-11-19 13:47:22 +00:00
2015-06-24 14:50:14 +00:00
QString : : fromStdString ( addAudioLink ( string ( " \" " ) + wikiUrl . scheme ( ) . toStdString ( ) + " : \\ 1 \" " ,
2018-03-11 12:42:02 +00:00
this - > dictPtr - > getId ( ) ) + " <a href= \" " + wikiUrl . scheme ( ) . toStdString ( ) + " : \\ 1 \" " ) ) ;
2012-01-26 07:56:23 +00:00
2015-06-24 14:50:14 +00:00
// Add url scheme to image source urls
articleString . replace ( " src= \" // " , " src= \" " + wikiUrl . scheme ( ) + " :// " ) ;
2012-06-21 09:43:28 +00:00
//fix src="/foo/bar/Baz.png"
2016-06-13 21:15:12 +00:00
articleString . replace ( " src= \" / " , " src= \" " + wikiUrl . toString ( ) ) ;
2012-01-26 07:56:23 +00:00
2018-01-28 20:15:03 +00:00
// Remove the /wiki/ prefix from links
2020-11-23 16:43:50 +00:00
articleString . replace ( " <a href= \" /wiki/ " , " <a href= \" " ) ;
2012-01-26 07:56:23 +00:00
2009-05-05 08:56:46 +00:00
// In those strings, change any underscores to spaces
2022-12-24 22:01:50 +00:00
QRegularExpression rxLink ( R " (<a \ s+href= " [ ^ / : " >#]+) " ) ;
2018-02-27 16:42:21 +00:00
it = rxLink . globalMatch ( articleString ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2018-02-28 14:15:27 +00:00
// The replacement is done in place and keeps the string length, so the
// match offsets stay valid. The scan starts 9 characters into the match.
for ( int i = match . capturedStart ( ) + 9 ; i < match . capturedEnd ( ) ; i + + )
2018-02-27 16:42:21 +00:00
if ( articleString . at ( i ) = = QChar ( ' _ ' ) )
articleString [ i ] = ' ' ;
}
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
//fix file: url
2022-12-24 22:01:50 +00:00
articleString . replace ( QRegularExpression ( R " (<a \ s+href= " ( [ ^ : / " ]*file%3A[^/ " ] + " )) " ,
2018-02-28 14:15:27 +00:00
QRegularExpression : : CaseInsensitiveOption ) ,
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
QString ( " <a href= \" %1/index.php?title= \\ 1 " ) . arg ( url ) ) ;
2018-03-11 13:33:17 +00:00
// Add url scheme to other urls like "//xxx"
articleString . replace ( " href= \" // " , " href= \" " + wikiUrl . scheme ( ) + " :// " ) ;
2023-03-02 13:23:01 +00:00
// Add url scheme to other urls like embed css background: url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat
articleString . replace ( " url( \" // " , " url( \" " + wikiUrl . scheme ( ) + " :// " ) ;
2019-03-04 16:36:58 +00:00
// Fix urls in "srcset" attribute
pos = 0 ;
2022-12-24 22:01:50 +00:00
QRegularExpression regSrcset ( R " ( srcset \ s*= \ s* " / [ ^ " ]+ " ) " ) ;
2019-03-04 16:36:58 +00:00
it = regSrcset . globalMatch ( articleString ) ;
while ( it . hasNext ( ) )
{
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2019-03-04 16:36:58 +00:00
pos = match . capturedEnd ( ) ;
QString srcset = match . captured ( ) ;
2021-11-19 13:47:22 +00:00
2019-03-04 16:36:58 +00:00
QString newSrcset = srcset . replace ( " // " , wikiUrl . scheme ( ) + " :// " ) ;
articleNewString + = newSrcset ;
}
if ( pos )
{
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2019-03-04 16:36:58 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2019-03-04 16:36:58 +00:00
2023-02-07 06:26:36 +00:00
// Insert the ToC in the end to improve performance because no replacements are needed in the generated ToC.
MediaWikiSectionsParser : : generateTableOfContentsIfEmpty ( parseNode , articleString ) ;
2009-05-05 08:56:46 +00:00
QByteArray articleBody = articleString . toUtf8 ( ) ;
2018-03-11 13:33:17 +00:00
2022-12-24 22:01:50 +00:00
// Wrap the article in a container div; a right-to-left variant is used
// when the dictionary reports an RTL target language.
articleBody . prepend ( dictPtr - > isToLanguageRTL ( ) ? R " (<div class= " mwiki " dir= " rtl " >) " :
2013-07-10 13:48:09 +00:00
" <div class= \" mwiki \" > " ) ;
2009-05-05 08:56:46 +00:00
articleBody . append ( " </div> " ) ;
// data is appended to under dataMutex.
Mutex : : Lock _ ( dataMutex ) ;
size_t prevSize = data . size ( ) ;
data . resize ( prevSize + articleBody . size ( ) ) ;
memcpy ( & data . front ( ) + prevSize , articleBody . data ( ) , articleBody . size ( ) ) ;
hasAnyData = true ;
updated = true ;
2009-03-26 19:00:08 +00:00
}
}
}
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " done. \n " ) ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
else
setErrorString ( netReply - > errorString ( ) ) ;
2016-04-21 15:14:04 +00:00
2017-03-20 14:31:02 +00:00
// The reply has been fully handled: drop its connections and let the event
// loop delete it.
disconnect ( netReply , 0 , 0 , 0 ) ;
2016-04-21 15:14:04 +00:00
netReply - > deleteLater ( ) ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
// All replies consumed: the request is complete. Otherwise only signal an
// update if new data was appended.
if ( netReplies . empty ( ) )
finish ( ) ;
else
if ( updated )
update ( ) ;
2009-03-26 19:00:08 +00:00
}
/// Creates a title search request for `word` against this wiki.
/// Words longer than 80 characters are answered with an instantly finished,
/// empty request instead of a network query.
/// @param word       the word to search titles for
/// @param maxResults unused
sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word, unsigned long maxResults )
{
  static_cast< void >( maxResults );

  // Only reasonably short words are worth a network round trip; excessively
  // large queries are fruitless anyway.
  if ( word.size() <= 80 )
    return std::make_shared< MediaWikiWordSearchRequest >( word, url, netMgr );

  return std::make_shared< WordSearchRequestInstant >();
}
2009-05-29 19:48:50 +00:00
/// Creates an article request for `word` and its alternative forms.
/// Words longer than 80 characters are answered with an instantly finished
/// request constructed with `false` instead of a network query. The last two
/// parameters are ignored.
/// @param word the main word to look up
/// @param alts alternative forms that are queried as well
sptr< DataRequest > MediaWikiDictionary::getArticle( wstring const & word,
                                                    vector< wstring > const & alts,
                                                    wstring const &,
                                                    bool )
{
  // Only reasonably short words are worth a network round trip; excessively
  // large queries are fruitless anyway.
  if ( word.size() <= 80 )
    return std::make_shared< MediaWikiArticleRequest >( word, alts, url, netMgr, this );

  return std::make_shared< DataRequestInstant >( false );
}
}
/// Creates one MediaWikiDictionary for every enabled entry of `wikis`, in the
/// order the entries appear. Disabled entries are skipped.
/// @param wikis configured MediaWiki sites
/// @param mgr   network manager shared by all created dictionaries
/// @return the created dictionaries
vector< sptr< Dictionary::Class > > makeDictionaries( Dictionary::Initializing &,
                                                      Config::MediaWikis const & wikis,
                                                      QNetworkAccessManager & mgr )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for ( int idx = 0; idx < wikis.size(); ++idx ) {
    auto const & wiki = wikis[ idx ];

    if ( !wiki.enabled )
      continue;

    dictionaries.push_back( std::make_shared< MediaWikiDictionary >( wiki.id.toStdString(),
                                                                     wiki.name.toUtf8().data(),
                                                                     wiki.url,
                                                                     wiki.icon,
                                                                     mgr ) );
  }

  return dictionaries;
}
2009-04-21 20:09:02 +00:00
2009-03-26 19:00:08 +00:00
}