/* This file is (c) 2008-2012 Konstantin Isakov * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
/* Overview (review notes) -- MediaWiki-backed dictionary for GoldenDict.
 *
 * NOTE(review): in this copy of the file most line breaks are missing, the
 * bare `#include` directives carry no header name, and several string
 * literals / regular-expression patterns are empty or cut short (for example
 * emptyTocIndicator, the body of generateTableOfContents() and the link
 * rewriting inside requestFinished()). The code text below is left exactly as
 * found; confirm every such spot against the upstream source before use.
 *
 * What the visible code defines, in order of appearance:
 *
 * - MediaWikiDictionary (derives from Dictionary::Class): stores a name, the
 *   site url, an icon path and a reference to a QNetworkAccessManager.
 *   The constructor finds the first '.' in url; when it is at index 2, or at
 *   an index above 3 with a '/' three characters before it, the two
 *   characters preceding the dot are passed to LangCoder::code2toInt() and
 *   stored in langId; otherwise langId stays 0. getLangFrom() and getLangTo()
 *   both return langId. getArticleCount() and getWordCount() return 0 and
 *   getProperties() returns an empty map.
 *
 * - MediaWikiWordSearchRequestSlots / MediaWikiDataRequestSlots: Q_OBJECT
 *   classes declaring the protected slots downloadFinished() and
 *   requestFinished( QNetworkReply * ) with empty bodies; the request classes
 *   below override them.
 *
 * - MediaWikiDictionary::loadIcon(): returns at once when
 *   dictionaryIconLoaded is set. If icon is non-empty and names an existing
 *   file under Config::getConfigDir(), it is loaded with loadIconFromFile().
 *   If dictionaryIcon is still null, a bundled icon is used: wiktionary.png
 *   when url contains "tionary", icon32_wiki.png otherwise. Finally sets
 *   dictionaryIconLoaded.
 *
 * - MediaWikiWordSearchRequest: the constructor builds
 *   url + "/api.php?action=query&list=allpages&aplimit=40&format=xml", adds
 *   the host to the GlobalBroadcaster whitelist, adds the query item
 *   "apprefix" (search string with '+' replaced by "%2B"), sets a transfer
 *   timeout of 2000 (milliseconds per the original comment) and issues a GET.
 *   finished() is connected to downloadFinished(); unless QT_NO_SSL is
 *   defined, sslErrors is connected to the reply's ignoreSslErrors().
 *   cancel() sets isCancelling, drops netReply and calls finish().
 *   downloadFinished() does nothing when cancelling or already finished. On
 *   QNetworkReply::NoError it parses the reply as XML; a parse failure sets
 *   the error string, otherwise the "title" attribute of every "p" element
 *   under api/query/allpages is appended to matches while dataMutex is held.
 *   On a network error the reply's error string is stored. finish() is called
 *   in every non-early-return path.
 *
 * - MediaWikiSectionsParser: generateTableOfContentsIfEmpty() looks for
 *   emptyTocIndicator in articleString and returns when it is not found.
 *   Otherwise it takes the "sections" child element of parseNode (warning and
 *   returning when it is missing), lets a private parser instance build
 *   tableOfContents and replaces the indicator with it.
 *   addListLevel() converts levelString to int and returns false, after a
 *   gdWarning, when it is not an integer, is <= 0, or exceeds
 *   previousLevel + 1. For level == previousLevel + 1 it appends list-opening
 *   text and updates previousLevel; otherwise it calls closeListTags(). It
 *   then asserts level == previousLevel, appends list-item-opening text and
 *   returns true.
 *   closeListTags() asserts currentLevel <= previousLevel, appends closing
 *   text once, then appends more closing text and decrements previousLevel
 *   until it equals currentLevel.
 *
 * - MediaWikiArticleRequest: the constructor connects the manager's
 *   finished( QNetworkReply * ) signal to requestFinished() with
 *   Qt::QueuedConnection, then calls addQuery() for the word and for every
 *   entry of alts. addQuery() requests
 *   url + "/api.php?action=parse&prop=text|revid|sections&format=xml&redirects"
 *   with the query item "page", a transfer timeout of 3000, logs
 *   errorOccurred via qDebug() and stores ( reply, false ) in netReplies.
 *   cancel() only calls finish().
 *   requestFinished() returns when the request is finished or the reply is
 *   not one of netReplies. Otherwise it marks the reply as finished and then
 *   consumes netReplies from the front for as long as the front entry is
 *   marked finished. For each consumed reply: on NoError the XML is parsed;
 *   the api/parse node is used only if it exists, its "revid" attribute is
 *   not "0" and its "pageid" was not seen before (tracked by addedPageIds).
 *   The "text" element is then rewritten and passed to appendString(); on a
 *   network error the error string is stored. Every consumed reply is
 *   disconnected and scheduled with deleteLater(). When netReplies becomes
 *   empty finish() is called, otherwise update() is called if something was
 *   appended.
 *   SmallSet< T > is a vector-backed set whose insert() returns false when
 *   the value is already present.
 *
 * - MediaWikiDictionary::prefixMatch() / getArticle(): for words longer than
 *   80 characters return WordSearchRequestInstant / DataRequestInstant( false )
 *   without any network request; otherwise create the request classes above.
 *   prefixMatch() ignores maxResults.
 *
 * - makeDictionaries(): returns one MediaWikiDictionary per entry of wikis
 *   whose enabled flag is set.
 */
#include "mediawiki.hh" #include "wstring_qt.hh" #include #include #include #include #include #include #include "gddebug.hh" #include "audiolink.hh" #include "langcoder.hh" #include "utils.hh" #include #include "globalbroadcaster.hh" namespace MediaWiki { using namespace Dictionary; namespace { class MediaWikiDictionary: public Dictionary::Class { string name; QString url, icon; QNetworkAccessManager & netMgr; quint32 langId; public: MediaWikiDictionary( string const & id, string const & name_, QString const & url_, QString const & icon_, QNetworkAccessManager & netMgr_ ): Dictionary::Class( id, vector< string >() ), name( name_ ), url( url_ ), icon( icon_ ), netMgr( netMgr_ ), langId( 0 ) { int n = url.indexOf( "." ); if ( n == 2 || ( n > 3 && url[ n - 3 ] == '/' ) ) langId = LangCoder::code2toInt( url.mid( n - 2, 2 ).toLatin1().data() ); } string getName() noexcept override { return name; } map< Property, string > getProperties() noexcept override { return map< Property, string >(); } unsigned long getArticleCount() noexcept override { return 0; } unsigned long getWordCount() noexcept override { return 0; } sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override; sptr< DataRequest > getArticle( wstring const &, vector< wstring > const & alts, wstring const &, bool ) override; quint32 getLangFrom() const override { return langId; } quint32 getLangTo() const override { return langId; } protected: void loadIcon() noexcept override; }; class MediaWikiWordSearchRequestSlots: public Dictionary::WordSearchRequest { Q_OBJECT protected slots: virtual void downloadFinished() {} }; class MediaWikiDataRequestSlots: public Dictionary::DataRequest { Q_OBJECT protected slots: virtual void requestFinished( QNetworkReply * ) {} }; void MediaWikiDictionary::loadIcon() noexcept { if ( dictionaryIconLoaded ) return; if ( 
!icon.isEmpty() ) { QFileInfo fInfo( QDir( Config::getConfigDir() ), icon ); if ( fInfo.isFile() ) loadIconFromFile( fInfo.absoluteFilePath(), true ); } if ( dictionaryIcon.isNull() ) { if ( url.contains( "tionary" ) ) dictionaryIcon = QIcon( ":/icons/wiktionary.png" ); else dictionaryIcon = QIcon( ":/icons/icon32_wiki.png" ); } dictionaryIconLoaded = true; } class MediaWikiWordSearchRequest: public MediaWikiWordSearchRequestSlots { sptr< QNetworkReply > netReply; bool isCancelling; public: MediaWikiWordSearchRequest( wstring const &, QString const & url, QNetworkAccessManager & mgr ); ~MediaWikiWordSearchRequest(); void cancel() override; private: void downloadFinished() override; }; MediaWikiWordSearchRequest::MediaWikiWordSearchRequest( wstring const & str, QString const & url, QNetworkAccessManager & mgr ): isCancelling( false ) { GD_DPRINTF( "wiki request begin\n" ); QUrl reqUrl( url + "/api.php?action=query&list=allpages&aplimit=40&format=xml" ); GlobalBroadcaster::instance()->addWhitelist( reqUrl.host() ); Utils::Url::addQueryItem( reqUrl, "apprefix", QString::fromStdU32String( str ).replace( '+', "%2B" ) ); QNetworkRequest req( reqUrl ); //milliseconds. 
req.setTransferTimeout( 2000 ); netReply = std::shared_ptr< QNetworkReply >( mgr.get( req ) ); connect( netReply.get(), SIGNAL( finished() ), this, SLOT( downloadFinished() ) ); #ifndef QT_NO_SSL connect( netReply.get(), SIGNAL( sslErrors( QList< QSslError > ) ), netReply.get(), SLOT( ignoreSslErrors() ) ); #endif } MediaWikiWordSearchRequest::~MediaWikiWordSearchRequest() { GD_DPRINTF( "request end\n" ); } void MediaWikiWordSearchRequest::cancel() { // We either finish it in place, or in the timer handler isCancelling = true; if ( netReply.get() ) netReply.reset(); finish(); GD_DPRINTF( "cancel the request" ); } void MediaWikiWordSearchRequest::downloadFinished() { if ( isCancelling || isFinished() ) // Was cancelled return; if ( netReply->error() == QNetworkReply::NoError ) { QDomDocument dd; QString errorStr; int errorLine, errorColumn; if ( !dd.setContent( netReply.get(), false, &errorStr, &errorLine, &errorColumn ) ) { setErrorString( QString( tr( "XML parse error: %1 at %2,%3" ).arg( errorStr ).arg( errorLine ).arg( errorColumn ) ) ); } else { QDomNode pages = dd.namedItem( "api" ).namedItem( "query" ).namedItem( "allpages" ); if ( !pages.isNull() ) { QDomNodeList nl = pages.toElement().elementsByTagName( "p" ); QMutexLocker _( &dataMutex ); qDebug() << "matches" << matches.size(); for ( int x = 0; x < nl.length(); ++x ) matches.emplace_back( gd::toWString( nl.item( x ).toElement().attribute( "title" ) ) ); } } GD_DPRINTF( "done.\n" ); } else setErrorString( netReply->errorString() ); finish(); } class MediaWikiSectionsParser { public: /// Since a recent Wikipedia UI redesign, the table of contents (ToC) is no longer part of an article's HTML. /// ToC is absent from the text node of Wikipedia's MediaWiki API reply. Quote from /// https://www.mediawiki.org/wiki/Reading/Web/Desktop_Improvements/Features/Table_of_contents#How_can_I_get_the_old_table_of_contents? 
/// We intentionally do not add the old table of contents to the article in addition to the new sidebar location... /// Users can restore the old table of contents position with the following JavaScript code: /// document.querySelector('mw\\3Atocplace,meta[property="mw:PageProp/toc"]').replaceWith( document.getElementById('mw-panel-toc') ) /// /// This function searches for an indicator of the empty ToC in an article HTML. If the indicator is present, /// generates ToC HTML from the sections element and replaces the indicator with the generated ToC. static void generateTableOfContentsIfEmpty( QDomNode const & parseNode, QString & articleString ) { QString const emptyTocIndicator = ""; int const emptyTocPos = articleString.indexOf( emptyTocIndicator ); if ( emptyTocPos == -1 ) return; // The ToC must be absent or nonempty => nothing to do. QDomElement const sectionsElement = parseNode.firstChildElement( "sections" ); if ( sectionsElement.isNull() ) { gdWarning( "MediaWiki: empty table of contents and missing sections element." ); return; } gdDebug( "MediaWiki: generating table of contents from the sections element." ); MediaWikiSectionsParser parser; parser.generateTableOfContents( sectionsElement ); articleString.replace( emptyTocPos, emptyTocIndicator.size(), parser.tableOfContents ); } private: MediaWikiSectionsParser(): previousLevel( 0 ) { } void generateTableOfContents( QDomElement const & sectionsElement ); bool addListLevel( QString const & levelString ); void closeListTags( int currentLevel ); QString tableOfContents; int previousLevel; }; void MediaWikiSectionsParser::generateTableOfContents( QDomElement const & sectionsElement ) { // A real example of a typical child of the element: // // Use Wiktionary's ToC style, which had also been Wikipedia's ToC style until the UI redesign. // Replace double quotes with single quotes to avoid escaping " within string literals. 
QString const elTagName = "s"; QDomElement el = sectionsElement.firstChildElement( elTagName ); if ( el.isNull() ) return; // Omit invisible and useless toctogglecheckbox, toctogglespan and toctogglelabel elements. // The values of lang (e.g. 'en') and dir (e.g. 'ltr') attributes of the toctitle element depend on // the article's language. These attributes have no visible effect and so are simply omitted here. // TODO: the "Contents" string should be translated to the article's language, but I don't know how // to implement this. Should "Contents" be enclosed in tr() to at least translate it to GoldenDict's // interface language? Is there a language-agnostic Unicode symbol that stands for "Contents"? tableOfContents = ""; } bool MediaWikiSectionsParser::addListLevel( QString const & levelString ) { bool convertedToInt; int const level = levelString.toInt( &convertedToInt ); if ( !convertedToInt ) { gdWarning( "MediaWiki: sections level is not an integer: %s", levelString.toUtf8().constData() ); return false; } if ( level <= 0 ) { gdWarning( "MediaWiki: unsupported nonpositive sections level: %s", levelString.toUtf8().constData() ); return false; } if ( level > previousLevel + 1 ) { gdWarning( "MediaWiki: unsupported sections level increase by more than one: from %d to %s", previousLevel, levelString.toUtf8().constData() ); return false; } if ( level == previousLevel + 1 ) { // Don't close the previous list item tag to nest the current deeper level's list in it. tableOfContents += "\n
    \n"; previousLevel = level; } else closeListTags( level ); Q_ASSERT( level == previousLevel ); // Open this list item tag. // Omit the (e.g.) class="toclevel-4 tocsection-9" attribute of
  • because it has no visible effect. tableOfContents += "
  • "; return true; } void MediaWikiSectionsParser::closeListTags( int currentLevel ) { Q_ASSERT( currentLevel <= previousLevel ); // Close the previous list item tag. tableOfContents += "
  • \n"; // Close list and list item tags of deeper levels, if any. while ( currentLevel < previousLevel ) { tableOfContents += "
\n\n"; --previousLevel; } } class MediaWikiArticleRequest: public MediaWikiDataRequestSlots { typedef std::list< std::pair< QNetworkReply *, bool > > NetReplies; NetReplies netReplies; QString url; public: MediaWikiArticleRequest( wstring const & word, vector< wstring > const & alts, QString const & url, QNetworkAccessManager & mgr, Class * dictPtr_ ); void cancel() override; private: void addQuery( QNetworkAccessManager & mgr, wstring const & word ); void requestFinished( QNetworkReply * ) override; /// This simple set implementation should be much more efficient than tree- /// and hash-based standard/Qt containers when there are very few elements. template< typename T > class SmallSet { public: bool insert( T x ) { if ( std::find( elements.begin(), elements.end(), x ) != elements.end() ) return false; elements.push_back( x ); return true; } private: std::vector< T > elements; }; /// The page id set allows to filter out duplicate articles in case MediaWiki /// redirects the main word and words in the alts collection to the same page. 
SmallSet< long long > addedPageIds; Class * dictPtr; }; void MediaWikiArticleRequest::cancel() { finish(); } MediaWikiArticleRequest::MediaWikiArticleRequest( wstring const & str, vector< wstring > const & alts, QString const & url_, QNetworkAccessManager & mgr, Class * dictPtr_ ): url( url_ ), dictPtr( dictPtr_ ) { connect( &mgr, SIGNAL( finished( QNetworkReply * ) ), this, SLOT( requestFinished( QNetworkReply * ) ), Qt::QueuedConnection ); addQuery( mgr, str ); for ( const auto & alt : alts ) addQuery( mgr, alt ); } void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, wstring const & str ) { gdDebug( "MediaWiki: requesting article %s\n", QString::fromStdU32String( str ).toUtf8().data() ); QUrl reqUrl( url + "/api.php?action=parse&prop=text|revid|sections&format=xml&redirects" ); Utils::Url::addQueryItem( reqUrl, "page", QString::fromStdU32String( str ).replace( '+', "%2B" ) ); QNetworkRequest req( reqUrl ); //milliseconds. req.setTransferTimeout( 3000 ); QNetworkReply * netReply = mgr.get( req ); connect( netReply, &QNetworkReply::errorOccurred, this, [ = ]( QNetworkReply::NetworkError e ) { qDebug() << "error:" << e; } ); #ifndef QT_NO_SSL connect( netReply, SIGNAL( sslErrors( QList< QSslError > ) ), netReply, SLOT( ignoreSslErrors() ) ); #endif netReplies.push_back( std::make_pair( netReply, false ) ); } void MediaWikiArticleRequest::requestFinished( QNetworkReply * r ) { GD_DPRINTF( "Finished.\n" ); if ( isFinished() ) // Was cancelled return; // Find this reply bool found = false; for ( auto & netReplie : netReplies ) { if ( netReplie.first == r ) { netReplie.second = true; // Mark as finished found = true; break; } } if ( !found ) { // Well, that's not our reply, don't do anything return; } bool updated = false; for ( ; netReplies.size() && netReplies.front().second; netReplies.pop_front() ) { QNetworkReply * netReply = netReplies.front().first; if ( netReply->error() == QNetworkReply::NoError ) { QDomDocument dd; QString errorStr; int 
errorLine, errorColumn; if ( !dd.setContent( netReply, false, &errorStr, &errorLine, &errorColumn ) ) { setErrorString( QString( tr( "XML parse error: %1 at %2,%3" ).arg( errorStr ).arg( errorLine ).arg( errorColumn ) ) ); } else { QDomNode parseNode = dd.namedItem( "api" ).namedItem( "parse" ); if ( !parseNode.isNull() && parseNode.toElement().attribute( "revid" ) != "0" // Don't show the same article more than once: && addedPageIds.insert( parseNode.toElement().attribute( "pageid" ).toLongLong() ) ) { QDomNode textNode = parseNode.namedItem( "text" ); if ( !textNode.isNull() ) { QString articleString = textNode.toElement().text(); // Replace all ":" in links, remove '#' part in links to other articles int pos = 0; QRegularExpression regLinks( "= 0 ) { // External link articleNewString += match.captured(); continue; } if ( link.indexOf( ':' ) >= 0 ) link.replace( ':', "%3A" ); int n = link.indexOf( '#', 1 ); if ( n > 0 ) { QString anchor = link.mid( n + 1 ).replace( '_', "%5F" ); link.truncate( n ); link += QString( "?gdanchor=%1" ).arg( anchor ); } QString newLink = QString( "", QRegularExpression::CaseInsensitiveOption | QRegularExpression::DotMatchesEverythingOption ); QRegularExpression reg2( R"(dictPtr->getId() ); QString audio_url = QString::fromStdString( script ) + "Play)"; articleNewString += audio_url; } else articleNewString += match.captured(); } if ( pos ) { articleNewString += articleString.mid( pos ); articleString = articleNewString; articleNewString.clear(); } // Add url scheme to image source urls articleString.replace( " src=\"//", " src=\"" + wikiUrl.scheme() + "://" ); //fix src="/foo/bar/Baz.png" articleString.replace( "src=\"/", "src=\"" + wikiUrl.toString() ); // Remove the /wiki/ prefix from links articleString.replace( "#]+)" ); it = rxLink.globalMatch( articleString ); while ( it.hasNext() ) { QRegularExpressionMatch match = it.next(); for ( int i = match.capturedStart() + 9; i < match.capturedEnd(); i++ ) if ( articleString.at( i ) == 
QChar( '_' ) ) articleString[ i ] = ' '; } //fix file: url articleString.replace( QRegularExpression( R"(isToLanguageRTL() ? R"(
)" : "
" ); articleString.append( "
" ); appendString( articleString.toStdString() ); hasAnyData = true; updated = true; } } } GD_DPRINTF( "done.\n" ); } else setErrorString( netReply->errorString() ); disconnect( netReply, 0, 0, 0 ); netReply->deleteLater(); } if ( netReplies.empty() ) finish(); else if ( updated ) update(); } sptr< WordSearchRequest > MediaWikiDictionary::prefixMatch( wstring const & word, unsigned long maxResults ) { (void)maxResults; if ( word.size() > 80 ) { // Don't make excessively large queries -- they're fruitless anyway return std::make_shared< WordSearchRequestInstant >(); } else return std::make_shared< MediaWikiWordSearchRequest >( word, url, netMgr ); } sptr< DataRequest > MediaWikiDictionary::getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) { if ( word.size() > 80 ) { // Don't make excessively large queries -- they're fruitless anyway return std::make_shared< DataRequestInstant >( false ); } else return std::make_shared< MediaWikiArticleRequest >( word, alts, url, netMgr, this ); } } // namespace vector< sptr< Dictionary::Class > > makeDictionaries( Dictionary::Initializing &, Config::MediaWikis const & wikis, QNetworkAccessManager & mgr ) { vector< sptr< Dictionary::Class > > result; for ( const auto & wiki : wikis ) { if ( wiki.enabled ) result.push_back( std::make_shared< MediaWikiDictionary >( wiki.id.toStdString(), wiki.name.toUtf8().data(), wiki.url, wiki.icon, mgr ) ); } return result; } #include "mediawiki.moc" } // namespace MediaWiki