2012-02-20 21:47:14 +00:00
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
2009-03-26 19:00:08 +00:00
* Part of GoldenDict . Licensed under GPLv3 or later , see the LICENSE file */
# include "mediawiki.hh"
2009-04-18 17:20:12 +00:00
# include "wstring_qt.hh"
2009-03-26 19:00:08 +00:00
# include <QNetworkAccessManager>
# include <QNetworkReply>
# include <QUrl>
# include <QtXml>
2021-06-10 16:28:23 +00:00
# include <algorithm>
2009-05-05 08:56:46 +00:00
# include <list>
2013-11-16 18:34:09 +00:00
# include "gddebug.hh"
2012-01-09 13:48:27 +00:00
# include "audiolink.hh"
2013-07-10 13:48:09 +00:00
# include "langcoder.hh"
2021-11-27 07:17:33 +00:00
# include "utils.hh"
2009-03-26 19:00:08 +00:00
2018-02-27 16:42:21 +00:00
# include <QRegularExpression>
2023-04-17 21:56:55 +00:00
# include "globalbroadcaster.hh"
2018-02-27 16:42:21 +00:00
2009-03-26 19:00:08 +00:00
namespace MediaWiki {
using namespace Dictionary ;
namespace {
class MediaWikiDictionary : public Dictionary : : Class
{
string name ;
2024-01-25 23:56:17 +00:00
QString url , icon , lang ;
2009-03-26 19:00:08 +00:00
QNetworkAccessManager & netMgr ;
2013-07-10 13:48:09 +00:00
quint32 langId ;
2009-04-21 20:09:02 +00:00
2009-03-26 19:00:08 +00:00
public :
MediaWikiDictionary ( string const & id ,
string const & name_ ,
QString const & url_ ,
2012-06-21 09:43:28 +00:00
QString const & icon_ ,
2024-01-25 23:56:17 +00:00
QString const & lang_ ,
2009-03-26 19:00:08 +00:00
QNetworkAccessManager & netMgr_ ) :
Dictionary : : Class ( id , vector < string > ( ) ) ,
name ( name_ ) ,
url ( url_ ) ,
2012-06-21 09:43:28 +00:00
icon ( icon_ ) ,
2024-01-25 23:56:17 +00:00
lang ( lang_ ) ,
2013-07-10 13:48:09 +00:00
netMgr ( netMgr_ ) ,
langId ( 0 )
2009-03-26 19:00:08 +00:00
{
2013-07-10 13:48:09 +00:00
int n = url . indexOf ( " . " ) ;
if ( n = = 2 | | ( n > 3 & & url [ n - 3 ] = = ' / ' ) ) {
langId = LangCoder : : code2toInt ( url . mid ( n - 2 , 2 ) . toLatin1 ( ) . data ( ) ) ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
}
2009-04-21 20:09:02 +00:00
2022-12-29 07:07:40 +00:00
string getName ( ) noexcept override
2009-03-26 19:00:08 +00:00
{
return name ;
}
2022-12-29 07:07:40 +00:00
map < Property , string > getProperties ( ) noexcept override
2009-03-26 19:00:08 +00:00
{
return map < Property , string > ( ) ;
}
2022-12-29 07:07:40 +00:00
unsigned long getArticleCount ( ) noexcept override
2009-03-26 19:00:08 +00:00
{
return 0 ;
}
2022-12-29 07:07:40 +00:00
unsigned long getWordCount ( ) noexcept override
2009-03-26 19:00:08 +00:00
{
return 0 ;
}
2022-12-29 07:07:40 +00:00
sptr < WordSearchRequest > prefixMatch ( wstring const & , unsigned long maxResults ) override ;
2009-03-26 19:00:08 +00:00
2022-12-29 07:07:40 +00:00
sptr < DataRequest > getArticle ( wstring const & , vector < wstring > const & alts , wstring const & , bool ) override ;
2012-12-03 12:47:43 +00:00
2022-12-29 07:07:40 +00:00
quint32 getLangFrom ( ) const override
2013-07-10 13:48:09 +00:00
{
return langId ;
}
2022-12-29 07:07:40 +00:00
quint32 getLangTo ( ) const override
2013-07-10 13:48:09 +00:00
{
return langId ;
}
2012-12-03 12:47:43 +00:00
protected :
2022-12-29 07:07:40 +00:00
void loadIcon ( ) noexcept override ;
2009-03-26 19:00:08 +00:00
} ;
2023-12-12 02:20:34 +00:00
class MediaWikiWordSearchRequestSlots : public Dictionary : : WordSearchRequest
{
Q_OBJECT
protected slots :
virtual void downloadFinished ( ) { }
} ;
class MediaWikiDataRequestSlots : public Dictionary : : DataRequest
{
Q_OBJECT
protected slots :
virtual void requestFinished ( QNetworkReply * ) { }
} ;
2022-06-03 13:28:41 +00:00
void MediaWikiDictionary : : loadIcon ( ) noexcept
2012-12-03 12:47:43 +00:00
{
2013-01-31 22:30:11 +00:00
if ( dictionaryIconLoaded ) {
return ;
2024-10-10 07:13:23 +00:00
}
2013-01-31 22:30:11 +00:00
2012-12-03 12:47:43 +00:00
if ( ! icon . isEmpty ( ) ) {
QFileInfo fInfo ( QDir ( Config : : getConfigDir ( ) ) , icon ) ;
if ( fInfo . isFile ( ) ) {
loadIconFromFile ( fInfo . absoluteFilePath ( ) , true ) ;
2024-10-10 07:13:23 +00:00
}
2012-12-03 12:47:43 +00:00
}
if ( dictionaryIcon . isNull ( ) ) {
2022-03-24 12:14:01 +00:00
if ( url . contains ( " tionary " ) ) {
2023-06-19 02:34:08 +00:00
dictionaryIcon = QIcon ( " :/icons/wiktionary.png " ) ;
2024-10-10 07:13:23 +00:00
}
2022-03-24 12:14:01 +00:00
else {
2023-06-19 02:34:08 +00:00
dictionaryIcon = QIcon ( " :/icons/icon32_wiki.png " ) ;
2024-10-10 07:13:23 +00:00
}
2022-03-24 12:14:01 +00:00
}
2012-12-03 12:47:43 +00:00
dictionaryIconLoaded = true ;
}
2009-03-26 19:00:08 +00:00
class MediaWikiWordSearchRequest : public MediaWikiWordSearchRequestSlots
{
sptr < QNetworkReply > netReply ;
bool isCancelling ;
public :
2024-01-25 23:56:17 +00:00
MediaWikiWordSearchRequest ( wstring const & , QString const & url , QString const & lang , QNetworkAccessManager & mgr ) ;
2009-03-26 19:00:08 +00:00
~ MediaWikiWordSearchRequest ( ) ;
2022-12-29 07:07:40 +00:00
void cancel ( ) override ;
2009-03-26 19:00:08 +00:00
private :
2009-04-21 20:09:02 +00:00
2022-12-29 07:07:40 +00:00
void downloadFinished ( ) override ;
2009-03-26 19:00:08 +00:00
} ;
/// Starts the prefix query.
/// @param str   prefix to search for; sent as the "apprefix" query item
/// @param url   base URL of the wiki
/// @param lang  sent as the "lang" query item
/// @param mgr   network manager that performs the GET
MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( wstring const & str,
                                                        QString const & url,
                                                        QString const & lang,
                                                        QNetworkAccessManager & mgr ):
  isCancelling( false )
{
  GD_DPRINTF( "wiki request begin\n" );

  QUrl reqUrl( url + "/api.php?action=query&list=allpages&aplimit=40&format=xml" );

  GlobalBroadcaster::instance()->addWhitelist( reqUrl.host() );

  // A literal '+' is replaced by its percent-encoded form before being added to the query.
  QString prefix = QString::fromStdU32String( str );
  prefix.replace( '+', "%2B" );
  Utils::Url::addQueryItem( reqUrl, "apprefix", prefix );
  Utils::Url::addQueryItem( reqUrl, "lang", lang );

  QNetworkRequest req( reqUrl );
  // The timeout is given in milliseconds.
  req.setTransferTimeout( 2000 );

  netReply = std::shared_ptr< QNetworkReply >( mgr.get( req ) );
  QNetworkReply * const reply = netReply.get();

  connect( reply, SIGNAL( finished() ), this, SLOT( downloadFinished() ) );

#ifndef QT_NO_SSL
  connect( reply, SIGNAL( sslErrors( QList< QSslError > ) ), reply, SLOT( ignoreSslErrors() ) );
#endif
}
/// Only emits a debug trace; the reply is released by the netReply member itself.
MediaWikiWordSearchRequest::~MediaWikiWordSearchRequest()
{
  GD_DPRINTF( "request end\n" );
}
void MediaWikiWordSearchRequest : : cancel ( )
{
// We either finish it in place, or in the timer handler
isCancelling = true ;
2022-02-25 15:41:54 +00:00
if ( netReply . get ( ) ) {
2009-03-26 19:00:08 +00:00
netReply . reset ( ) ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
2022-02-25 15:41:54 +00:00
finish ( ) ;
GD_DPRINTF ( " cancel the request " ) ;
2009-03-26 19:00:08 +00:00
}
void MediaWikiWordSearchRequest : : downloadFinished ( )
{
if ( isCancelling | | isFinished ( ) ) { // Was cancelled
return ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
if ( netReply - > error ( ) = = QNetworkReply : : NoError ) {
QDomDocument dd ;
QString errorStr ;
int errorLine , errorColumn ;
if ( ! dd . setContent ( netReply . get ( ) , false , & errorStr , & errorLine , & errorColumn ) ) {
setErrorString (
QString ( tr ( " XML parse error: %1 at %2,%3 " ) . arg ( errorStr ) . arg ( errorLine ) . arg ( errorColumn ) ) ) ;
}
else {
QDomNode pages = dd . namedItem ( " api " ) . namedItem ( " query " ) . namedItem ( " allpages " ) ;
if ( ! pages . isNull ( ) ) {
QDomNodeList nl = pages . toElement ( ) . elementsByTagName ( " p " ) ;
2023-05-29 13:56:04 +00:00
QMutexLocker _ ( & dataMutex ) ;
2009-04-21 20:09:02 +00:00
2023-06-05 13:22:20 +00:00
qDebug ( ) < < " matches " < < matches . size ( ) ;
for ( int x = 0 ; x < nl . length ( ) ; + + x ) {
matches . emplace_back ( gd : : toWString ( nl . item ( x ) . toElement ( ) . attribute ( " title " ) ) ) ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
}
}
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " done. \n " ) ;
2009-03-26 19:00:08 +00:00
}
else {
setErrorString ( netReply - > errorString ( ) ) ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
finish ( ) ;
}
2023-02-07 06:26:36 +00:00
class MediaWikiSectionsParser
{
public :
/// Since a recent Wikipedia UI redesign, the table of contents (ToC) is no longer part of an article's HTML.
/// ToC is absent from the text node of Wikipedia's MediaWiki API reply. Quote from
/// https://www.mediawiki.org/wiki/Reading/Web/Desktop_Improvements/Features/Table_of_contents#How_can_I_get_the_old_table_of_contents?
/// We intentionally do not add the old table of contents to the article in addition to the new sidebar location...
/// Users can restore the old table of contents position with the following JavaScript code:
/// document.querySelector('mw\\3Atocplace,meta[property="mw:PageProp/toc"]').replaceWith( document.getElementById('mw-panel-toc') )
///
/// This function searches for an indicator of the empty ToC in an article HTML. If the indicator is present,
/// generates ToC HTML from the sections element and replaces the indicator with the generated ToC.
static void generateTableOfContentsIfEmpty ( QDomNode const & parseNode , QString & articleString )
{
QString const emptyTocIndicator = " <meta property= \" mw:PageProp/toc \" /> " ;
int const emptyTocPos = articleString . indexOf ( emptyTocIndicator ) ;
if ( emptyTocPos = = - 1 ) {
return ; // The ToC must be absent or nonempty => nothing to do.
2024-10-10 07:13:23 +00:00
}
2023-02-07 06:26:36 +00:00
QDomElement const sectionsElement = parseNode . firstChildElement ( " sections " ) ;
if ( sectionsElement . isNull ( ) ) {
gdWarning ( " MediaWiki: empty table of contents and missing sections element. " ) ;
return ;
}
gdDebug ( " MediaWiki: generating table of contents from the sections element. " ) ;
MediaWikiSectionsParser parser ;
parser . generateTableOfContents ( sectionsElement ) ;
articleString . replace ( emptyTocPos , emptyTocIndicator . size ( ) , parser . tableOfContents ) ;
}
private :
MediaWikiSectionsParser ( ) :
previousLevel ( 0 )
{
}
void generateTableOfContents ( QDomElement const & sectionsElement ) ;
bool addListLevel ( QString const & levelString ) ;
void closeListTags ( int currentLevel ) ;
QString tableOfContents ;
int previousLevel ;
} ;
void MediaWikiSectionsParser : : generateTableOfContents ( QDomElement const & sectionsElement )
{
// A real example of a typical child of the <sections> element:
// <s linkAnchor="Marginal_densities" toclevel="2" fromtitle="Probability_density_function" level="3"
// line="Marginal densities" byteoffset="15868" anchor="Marginal_densities" number="7.1" index="9"/>
// Use Wiktionary's ToC style, which had also been Wikipedia's ToC style until the UI redesign.
// Replace double quotes with single quotes to avoid escaping " within string literals.
QString const elTagName = " s " ;
QDomElement el = sectionsElement . firstChildElement ( elTagName ) ;
if ( el . isNull ( ) ) {
return ;
2024-10-10 07:13:23 +00:00
}
2023-02-07 06:26:36 +00:00
// Omit invisible and useless toctogglecheckbox, toctogglespan and toctogglelabel elements.
// The values of lang (e.g. 'en') and dir (e.g. 'ltr') attributes of the toctitle element depend on
// the article's language. These attributes have no visible effect and so are simply omitted here.
// TODO: the "Contents" string should be translated to the article's language, but I don't know how
// to implement this. Should "Contents" be enclosed in tr() to at least translate it to GoldenDict's
// interface language? Is there a language-agnostic Unicode symbol that stands for "Contents"?
tableOfContents =
" <div id='toc' class='toc' role='navigation' aria-labelledby='mw-toc-heading'> "
" <div class='toctitle'><h2 id='mw-toc-heading'>Contents</h2></div> " ;
do {
if ( ! addListLevel ( el . attribute ( " toclevel " ) ) ) {
tableOfContents . clear ( ) ;
return ;
}
// From https://gerrit.wikimedia.org/r/c/mediawiki/core/+/831147/
// The anchor property ... should be used if you want to (eg) look up an element by ID using
// document.getElementById(). The linkAnchor property ... contains additional escaping appropriate for
// use in a URL fragment, and should be used (eg) if you are creating the href attribute of an <a> tag.
tableOfContents + = " <a href='# " ;
tableOfContents + = el . attribute ( " linkAnchor " ) ;
tableOfContents + = " '> " ;
// Omit <span class="tocnumber"> because it has no visible effect.
tableOfContents + = el . attribute ( " number " ) ;
tableOfContents + = ' ' ;
// Omit <span class="toctext"> because it has no visible effect.
tableOfContents + = el . attribute ( " line " ) ;
tableOfContents + = " </a> " ;
el = el . nextSiblingElement ( elTagName ) ;
} while ( ! el . isNull ( ) ) ;
closeListTags ( 1 ) ;
// Close the first-level list tag and the toc div tag.
tableOfContents + = " </ul> \n </div> " ;
}
bool MediaWikiSectionsParser : : addListLevel ( QString const & levelString )
{
bool convertedToInt ;
int const level = levelString . toInt ( & convertedToInt ) ;
if ( ! convertedToInt ) {
gdWarning ( " MediaWiki: sections level is not an integer: %s " , levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level < = 0 ) {
gdWarning ( " MediaWiki: unsupported nonpositive sections level: %s " , levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level > previousLevel + 1 ) {
gdWarning ( " MediaWiki: unsupported sections level increase by more than one: from %d to %s " ,
previousLevel ,
levelString . toUtf8 ( ) . constData ( ) ) ;
return false ;
}
if ( level = = previousLevel + 1 ) {
// Don't close the previous list item tag to nest the current deeper level's list in it.
tableOfContents + = " \n <ul> \n " ;
previousLevel = level ;
}
else {
closeListTags ( level ) ;
2024-10-10 07:13:23 +00:00
}
2023-02-07 06:26:36 +00:00
Q_ASSERT ( level = = previousLevel ) ;
// Open this list item tag.
// Omit the (e.g.) class="toclevel-4 tocsection-9" attribute of <li> because it has no visible effect.
tableOfContents + = " <li> " ;
return true ;
}
void MediaWikiSectionsParser : : closeListTags ( int currentLevel )
{
Q_ASSERT ( currentLevel < = previousLevel ) ;
// Close the previous list item tag.
tableOfContents + = " </li> \n " ;
// Close list and list item tags of deeper levels, if any.
while ( currentLevel < previousLevel ) {
tableOfContents + = " </ul> \n </li> \n " ;
- - previousLevel ;
}
}
2009-03-26 19:00:08 +00:00
class MediaWikiArticleRequest : public MediaWikiDataRequestSlots
{
2016-04-21 15:14:04 +00:00
typedef std : : list < std : : pair < QNetworkReply * , bool > > NetReplies ;
2009-05-05 08:56:46 +00:00
NetReplies netReplies ;
2009-03-26 19:00:08 +00:00
QString url ;
2024-01-25 23:56:17 +00:00
QString lang ;
2009-04-21 20:09:02 +00:00
2009-03-26 19:00:08 +00:00
public :
2009-05-05 08:56:46 +00:00
MediaWikiArticleRequest ( wstring const & word ,
vector < wstring > const & alts ,
2012-01-09 13:48:27 +00:00
QString const & url ,
2024-01-25 23:56:17 +00:00
QString const & lang ,
2012-01-09 13:48:27 +00:00
QNetworkAccessManager & mgr ,
2013-07-10 13:48:09 +00:00
Class * dictPtr_ ) ;
2009-03-26 19:00:08 +00:00
2022-12-29 07:07:40 +00:00
void cancel ( ) override ;
2009-03-26 19:00:08 +00:00
private :
2009-04-21 20:09:02 +00:00
2009-05-05 08:56:46 +00:00
void addQuery ( QNetworkAccessManager & mgr , wstring const & word ) ;
2022-12-29 07:07:40 +00:00
void requestFinished ( QNetworkReply * ) override ;
2021-06-10 16:28:23 +00:00
/// This simple set implementation should be much more efficient than tree-
/// and hash-based standard/Qt containers when there are very few elements.
template < typename T >
class SmallSet
{
public :
bool insert ( T x )
{
if ( std : : find ( elements . begin ( ) , elements . end ( ) , x ) ! = elements . end ( ) ) {
return false ;
2024-10-10 07:13:23 +00:00
}
2021-06-10 16:28:23 +00:00
elements . push_back ( x ) ;
return true ;
}
2023-07-20 08:02:22 +00:00
2021-06-10 16:28:23 +00:00
private :
std : : vector < T > elements ;
} ;
/// The page id set allows to filter out duplicate articles in case MediaWiki
/// redirects the main word and words in the alts collection to the same page.
SmallSet < long long > addedPageIds ;
2013-07-10 13:48:09 +00:00
Class * dictPtr ;
2009-03-26 19:00:08 +00:00
} ;
void MediaWikiArticleRequest : : cancel ( )
{
finish ( ) ;
}
/// Subscribes to the manager's finished() signal and issues one query for @p str followed by
/// one query per entry of @p alts, in that order.
MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str,
                                                  vector< wstring > const & alts,
                                                  QString const & url_,
                                                  QString const & lang_,
                                                  QNetworkAccessManager & mgr,
                                                  Class * dictPtr_ ):
  url( url_ ),
  lang( lang_ ),
  dictPtr( dictPtr_ )
{
  // The manager-wide signal is used, so requestFinished() also sees replies of other requests
  // and has to filter them out itself.
  connect( &mgr,
           SIGNAL( finished( QNetworkReply * ) ),
           this,
           SLOT( requestFinished( QNetworkReply * ) ),
           Qt::QueuedConnection );

  addQuery( mgr, str );

  std::for_each( alts.begin(), alts.end(), [ this, &mgr ]( wstring const & alternative ) {
    addQuery( mgr, alternative );
  } );
}
void MediaWikiArticleRequest : : addQuery ( QNetworkAccessManager & mgr , wstring const & str )
2009-03-26 19:00:08 +00:00
{
2023-04-16 09:07:07 +00:00
gdDebug ( " MediaWiki: requesting article %s \n " , QString : : fromStdU32String ( str ) . toUtf8 ( ) . data ( ) ) ;
2009-04-21 20:09:02 +00:00
2023-02-07 06:26:36 +00:00
QUrl reqUrl ( url + " /api.php?action=parse&prop=text|revid|sections&format=xml&redirects " ) ;
2009-03-26 19:00:08 +00:00
2023-04-16 09:07:07 +00:00
Utils : : Url : : addQueryItem ( reqUrl , " page " , QString : : fromStdU32String ( str ) . replace ( ' + ' , " %2B " ) ) ;
2024-01-25 23:56:17 +00:00
Utils : : Url : : addQueryItem ( reqUrl , " variant " , lang ) ;
2022-01-15 04:53:19 +00:00
QNetworkRequest req ( reqUrl ) ;
//millseconds.
req . setTransferTimeout ( 3000 ) ;
QNetworkReply * netReply = mgr . get ( req ) ;
connect ( netReply , & QNetworkReply : : errorOccurred , this , [ = ] ( QNetworkReply : : NetworkError e ) {
qDebug ( ) < < " error: " < < e ;
} ) ;
2022-12-15 07:11:09 +00:00
# ifndef QT_NO_SSL
2015-02-20 14:18:03 +00:00
2016-04-21 15:14:04 +00:00
connect ( netReply , SIGNAL ( sslErrors ( QList < QSslError > ) ) , netReply , SLOT ( ignoreSslErrors ( ) ) ) ;
2015-02-20 14:18:03 +00:00
# endif
2009-05-05 08:56:46 +00:00
netReplies . push_back ( std : : make_pair ( netReply , false ) ) ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
// Slot that receives a finished network reply (the constructor connects it to the manager's
// finished(QNetworkReply*) signal).
//
// Steps, in order:
//  1. Return immediately if this request is already finished.
//  2. Mark the entry of netReplies that matches r as finished; return if r is not in the list.
//  3. Process replies from the front of netReplies for as long as the front one is finished, so
//     article text is appended in the order in which the queries were added.
//  4. For each processed reply: on a network error record the error string; otherwise parse the
//     XML, take api/parse, skip it when revid is "0" or its pageid was already added, rewrite
//     links/audio/image URLs in the article HTML, fill in the table of contents, wrap the result
//     in a <div class="mwiki"> and append it to the request data.
//  5. Disconnect the processed reply and schedule it for deletion.
//  6. finish() when no replies remain, otherwise update() if new data was appended.
//
// NOTE(review): when step 1 returns early, replies still held in netReplies are not released
// by this function — confirm they are cleaned up elsewhere.
void MediaWikiArticleRequest : : requestFinished ( QNetworkReply * r )
2009-03-26 19:00:08 +00:00
{
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " Finished. \n " ) ;
2009-03-26 19:00:08 +00:00
if ( isFinished ( ) ) { // Was cancelled
return ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
// Find this reply
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
bool found = false ;
2023-07-20 08:02:22 +00:00
2023-07-29 16:50:03 +00:00
for ( auto & netReplie : netReplies ) {
if ( netReplie . first = = r ) {
netReplie . second = true ; // Mark as finished
2009-05-05 08:56:46 +00:00
found = true ;
break ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
}
2009-03-26 19:00:08 +00:00
2009-05-05 08:56:46 +00:00
if ( ! found ) {
// Well, that's not our reply, don't do anything
return ;
}
2023-07-20 08:02:22 +00:00
2009-05-05 08:56:46 +00:00
bool updated = false ;
2009-03-26 19:00:08 +00:00
// Consume finished replies from the front only; an unfinished reply at the front blocks the
// ones behind it until it finishes too.
2009-05-05 08:56:46 +00:00
for ( ; netReplies . size ( ) & & netReplies . front ( ) . second ; netReplies . pop_front ( ) ) {
2016-04-21 15:14:04 +00:00
QNetworkReply * netReply = netReplies . front ( ) . first ;
2023-07-20 08:02:22 +00:00
2009-05-05 08:56:46 +00:00
if ( netReply - > error ( ) = = QNetworkReply : : NoError ) {
QDomDocument dd ;
2023-07-20 08:02:22 +00:00
2009-05-05 08:56:46 +00:00
QString errorStr ;
int errorLine , errorColumn ;
2023-07-20 08:02:22 +00:00
2016-04-21 15:14:04 +00:00
if ( ! dd . setContent ( netReply , false , & errorStr , & errorLine , & errorColumn ) ) {
2009-05-05 08:56:46 +00:00
setErrorString (
QString ( tr ( " XML parse error: %1 at %2,%3 " ) . arg ( errorStr ) . arg ( errorLine ) . arg ( errorColumn ) ) ) ;
}
else {
QDomNode parseNode = dd . namedItem ( " api " ) . namedItem ( " parse " ) ;
2023-07-20 08:02:22 +00:00
2021-06-10 16:28:23 +00:00
if ( ! parseNode . isNull ( )
& & parseNode . toElement ( ) . attribute ( " revid " ) ! = " 0 "
// Don't show the same article more than once:
& & addedPageIds . insert ( parseNode . toElement ( ) . attribute ( " pageid " ) . toLongLong ( ) ) ) {
2009-05-05 08:56:46 +00:00
QDomNode textNode = parseNode . namedItem ( " text " ) ;
2023-07-20 08:02:22 +00:00
2009-05-05 08:56:46 +00:00
if ( ! textNode . isNull ( ) ) {
QString articleString = textNode . toElement ( ) . text ( ) ;
2015-07-24 15:00:39 +00:00
// Replace all ":" in links, remove '#' part in links to other articles
// The rewritten text is assembled in articleNewString: unmatched stretches are copied over
// verbatim and each matched link is replaced; pos tracks the end of the last match.
int pos = 0 ;
2018-02-28 14:15:27 +00:00
QRegularExpression regLinks ( " <a \\ s+href= \" /([^ \" ]+) \ " " ) ;
2018-02-27 16:42:21 +00:00
QString articleNewString ;
QRegularExpressionMatchIterator it = regLinks . globalMatch ( articleString ) ;
while ( it . hasNext ( ) ) {
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-27 16:42:21 +00:00
pos = match . capturedEnd ( ) ;
QString link = match . captured ( 1 ) ;
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
if ( link . indexOf ( " :// " ) > = 0 ) {
// External link
2018-02-27 16:42:21 +00:00
articleNewString + = match . captured ( ) ;
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
continue ;
}
if ( link . indexOf ( ' : ' ) > = 0 ) {
link . replace ( ' : ' , " %3A " ) ;
2024-10-10 07:13:23 +00:00
}
2015-07-24 15:00:39 +00:00
int n = link . indexOf ( ' # ' , 1 ) ;
if ( n > 0 ) {
// The fragment is moved into a "gdanchor" query item, with underscores percent-encoded.
2015-10-31 19:26:50 +00:00
QString anchor = link . mid ( n + 1 ) . replace ( ' _ ' , " %5F " ) ;
2015-07-24 15:00:39 +00:00
link . truncate ( n ) ;
2015-10-31 19:26:50 +00:00
link + = QString ( " ?gdanchor=%1 " ) . arg ( anchor ) ;
}
2015-07-24 15:00:39 +00:00
QString newLink = QString ( " <a href= \" /%1 \" " ) . arg ( link ) ;
2018-02-27 16:42:21 +00:00
articleNewString + = newLink ;
}
// pos is non-zero only if at least one link matched; only then is the rewritten text adopted.
if ( pos ) {
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2018-02-27 16:42:21 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2015-07-24 15:00:39 +00:00
2013-05-31 04:20:25 +00:00
QUrl wikiUrl ( url ) ;
wikiUrl . setPath ( " / " ) ;
2023-07-20 08:02:22 +00:00
2009-05-05 08:56:46 +00:00
// Update any special index.php pages to be absolute
2022-12-24 22:01:50 +00:00
articleString . replace ( QRegularExpression ( R " (<a \ shref= " ( / ( [ \ w ] */ ) * index . php \ ? ) ) " ),
2018-02-27 16:42:21 +00:00
QString ( " <a href= \" %1 \\ 1 " ) . arg ( wikiUrl . toString ( ) ) ) ;
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
2014-07-24 14:10:36 +00:00
// audio tag
// Each <audio> element that contains a <source src=...> is replaced by the script returned
// from addAudioLink() plus a play-icon link; other <audio> elements are kept unchanged.
2018-02-27 16:42:21 +00:00
QRegularExpression reg1 ( " <audio \\ s.+?</audio> " ,
2018-03-11 12:42:02 +00:00
QRegularExpression : : CaseInsensitiveOption
| QRegularExpression : : DotMatchesEverythingOption ) ;
2022-12-24 22:01:50 +00:00
QRegularExpression reg2 ( R " (<source \ s+src= " ( [ ^ " ]+) ) " , QRegularExpression::CaseInsensitiveOption ) ;
2018-02-27 16:42:21 +00:00
pos = 0 ;
it = reg1 . globalMatch ( articleString ) ;
while ( it . hasNext ( ) ) {
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2018-02-27 16:42:21 +00:00
pos = match . capturedEnd ( ) ;
QString tag = match . captured ( ) ;
QRegularExpressionMatch match2 = reg2 . match ( tag ) ;
if ( match2 . hasMatch ( ) ) {
QString ref = match2 . captured ( 1 ) ;
2023-07-26 15:03:03 +00:00
// audio url may like this <a href="//upload.wikimedia.org/wikipedia/a.ogg"
if ( ref . startsWith ( " // " ) ) {
ref = wikiUrl . scheme ( ) + " : " + ref ;
}
2024-10-24 13:32:21 +00:00
auto script = addAudioLink ( ref , this - > dictPtr - > getId ( ) ) ;
2023-07-26 15:03:03 +00:00
QString audio_url = QString : : fromStdString ( script ) + " <a href= \" " + ref
2023-03-05 20:20:05 +00:00
+ R " ( " > < img src = " qrc:///icons/playsound.png " border = " 0 " align = " absmiddle " alt = " Play " / > < / a > ) " ;
2018-02-27 16:42:21 +00:00
articleNewString + = audio_url ;
}
else {
articleNewString + = match . captured ( ) ;
2024-10-10 07:13:23 +00:00
}
2018-02-27 16:42:21 +00:00
}
if ( pos ) {
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2018-02-27 16:42:21 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
2015-06-24 14:50:14 +00:00
// Add url scheme to image source urls
articleString . replace ( " src= \" // " , " src= \" " + wikiUrl . scheme ( ) + " :// " ) ;
2012-06-21 09:43:28 +00:00
//fix src="/foo/bar/Baz.png"
2016-06-13 21:15:12 +00:00
articleString . replace ( " src= \" / " , " src= \" " + wikiUrl . toString ( ) ) ;
2012-01-26 07:56:23 +00:00
2018-01-28 20:15:03 +00:00
// Remove the /wiki/ prefix from links
2020-11-23 16:43:50 +00:00
articleString . replace ( " <a href= \" /wiki/ " , " <a href= \" " ) ;
2012-01-26 07:56:23 +00:00
2009-05-05 08:56:46 +00:00
// In those strings, change any underscores to spaces
2022-12-24 22:01:50 +00:00
QRegularExpression rxLink ( R " (<a \ s+href= " [ ^ / : " >#]+) " ) ;
2018-02-27 16:42:21 +00:00
it = rxLink . globalMatch ( articleString ) ;
while ( it . hasNext ( ) ) {
QRegularExpressionMatch match = it . next ( ) ;
// The scan starts 9 characters into the match and edits articleString in place;
// replacing one character by another keeps the match offsets valid.
2018-02-28 14:15:27 +00:00
for ( int i = match . capturedStart ( ) + 9 ; i < match . capturedEnd ( ) ; i + + ) {
2018-02-27 16:42:21 +00:00
if ( articleString . at ( i ) = = QChar ( ' _ ' ) ) {
articleString [ i ] = ' ' ;
2024-10-10 07:13:23 +00:00
}
}
2018-02-27 16:42:21 +00:00
}
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
//fix file: url
2022-12-24 22:01:50 +00:00
articleString . replace (
QRegularExpression ( R " (<a \ s+href= " ( [ ^ : / " ]*file%3A[^/ " ] + " )) " , QRegularExpression : : CaseInsensitiveOption ) ,
2021-11-19 13:47:22 +00:00
2012-01-26 07:56:23 +00:00
QString ( " <a href= \" %1/index.php?title= \\ 1 " ) . arg ( url ) ) ;
2018-03-11 13:33:17 +00:00
// Add url scheme to other urls like "//xxx"
articleString . replace ( " href= \" // " , " href= \" " + wikiUrl . scheme ( ) + " :// " ) ;
2023-03-02 13:23:01 +00:00
// Add url scheme to other urls like embed css background: url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat
articleString . replace ( " url( \" // " , " url( \" " + wikiUrl . scheme ( ) + " :// " ) ;
2019-03-04 16:36:58 +00:00
// Fix urls in "srcset" attribute
pos = 0 ;
2022-12-24 22:01:50 +00:00
QRegularExpression regSrcset ( R " ( srcset \ s*= \ s* " / [ ^ " ]+ " ) " ) ;
2019-03-04 16:36:58 +00:00
it = regSrcset . globalMatch ( articleString ) ;
while ( it . hasNext ( ) ) {
QRegularExpressionMatch match = it . next ( ) ;
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos , match . capturedStart ( ) - pos ) ;
2019-03-04 16:36:58 +00:00
pos = match . capturedEnd ( ) ;
QString srcset = match . captured ( ) ;
2021-11-19 13:47:22 +00:00
2019-03-04 16:36:58 +00:00
QString newSrcset = srcset . replace ( " // " , wikiUrl . scheme ( ) + " :// " ) ;
articleNewString + = newSrcset ;
}
if ( pos ) {
2022-02-27 05:17:37 +00:00
articleNewString + = articleString . mid ( pos ) ;
2019-03-04 16:36:58 +00:00
articleString = articleNewString ;
articleNewString . clear ( ) ;
}
2021-11-19 13:47:22 +00:00
2019-03-04 16:36:58 +00:00
2023-02-07 06:26:36 +00:00
// Insert the ToC in the end to improve performance because no replacements are needed in the generated ToC.
MediaWikiSectionsParser : : generateTableOfContentsIfEmpty ( parseNode , articleString ) ;
// Wrap the article in a "mwiki" div; the dir="rtl" variant is used when the dictionary
// reports a right-to-left target language.
2023-06-23 15:09:31 +00:00
articleString . prepend ( dictPtr - > isToLanguageRTL ( ) ? R " (<div class= " mwiki " dir= " rtl " >) " :
2013-07-10 13:48:09 +00:00
" <div class= \" mwiki \" > " ) ;
2023-06-23 15:09:31 +00:00
articleString . append ( " </div> " ) ;
2023-05-29 13:56:04 +00:00
2023-06-23 15:09:31 +00:00
appendString ( articleString . toStdString ( ) ) ;
2009-05-05 08:56:46 +00:00
hasAnyData = true ;
updated = true ;
2009-03-26 19:00:08 +00:00
}
}
}
2014-05-10 21:02:31 +00:00
GD_DPRINTF ( " done. \n " ) ;
2009-03-26 19:00:08 +00:00
}
2009-05-05 08:56:46 +00:00
else {
setErrorString ( netReply - > errorString ( ) ) ;
2024-10-10 07:13:23 +00:00
}
2016-04-21 15:14:04 +00:00
// The reply has been fully handled: detach all its connections and let Qt delete it later.
2017-03-20 14:31:02 +00:00
disconnect ( netReply , 0 , 0 , 0 ) ;
2016-04-21 15:14:04 +00:00
netReply - > deleteLater ( ) ;
2009-03-26 19:00:08 +00:00
}
// Finish once every reply has been consumed; otherwise only signal that new data is available.
2009-05-05 08:56:46 +00:00
if ( netReplies . empty ( ) ) {
finish ( ) ;
2024-10-10 07:13:23 +00:00
}
2009-05-05 08:56:46 +00:00
else if ( updated ) {
update ( ) ;
2024-10-10 07:13:23 +00:00
}
2009-03-26 19:00:08 +00:00
}
/// Starts a prefix search for @p word on the wiki.
/// Words longer than 80 characters are answered with an instant, empty request.
/// @p maxResults is not used.
sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word, unsigned long maxResults )
{
  static_cast< void >( maxResults );

  bool const queryTooLong = word.size() > 80;
  if ( queryTooLong ) {
    // Don't make excessively large queries -- they're fruitless anyway
    return std::make_shared< WordSearchRequestInstant >();
  }

  return std::make_shared< MediaWikiWordSearchRequest >( word, url, lang, netMgr );
}
2009-05-29 19:48:50 +00:00
/// Starts an article request for @p word and its alternative forms @p alts.
/// Words longer than 80 characters are answered with an instant request constructed with false.
/// The last two parameters are not used.
sptr< DataRequest >
MediaWikiDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool )
{
  bool const queryTooLong = word.size() > 80;
  if ( queryTooLong ) {
    // Don't make excessively large queries -- they're fruitless anyway
    return std::make_shared< DataRequestInstant >( false );
  }

  return std::make_shared< MediaWikiArticleRequest >( word, alts, url, lang, netMgr, this );
}
} // namespace
2023-07-20 08:02:22 +00:00
2009-03-26 19:00:08 +00:00
/// Creates one MediaWikiDictionary for every enabled entry of @p wikis; disabled entries are
/// skipped. All created dictionaries share the network manager @p mgr.
vector< sptr< Dictionary::Class > >
makeDictionaries( Dictionary::Initializing &, Config::MediaWikis const & wikis, QNetworkAccessManager & mgr )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for ( auto const & wiki : wikis ) {
    if ( !wiki.enabled ) {
      continue;
    }

    dictionaries.push_back( std::make_shared< MediaWikiDictionary >( wiki.id.toStdString(),
                                                                     wiki.name.toUtf8().data(),
                                                                     wiki.url,
                                                                     wiki.icon,
                                                                     wiki.lang,
                                                                     mgr ) );
  }

  return dictionaries;
}
2009-04-21 20:09:02 +00:00
2023-12-12 02:20:34 +00:00
# include "mediawiki.moc"
2009-03-26 19:00:08 +00:00
} // namespace MediaWiki