diff --git a/mediawiki.cc b/mediawiki.cc index 57b9e96b..ca6359de 100644 --- a/mediawiki.cc +++ b/mediawiki.cc @@ -200,6 +200,157 @@ void MediaWikiWordSearchRequest::downloadFinished() finish(); } +class MediaWikiSectionsParser +{ +public: + /// Since a recent Wikipedia UI redesign, the table of contents (ToC) is no longer part of an article's HTML. + /// ToC is absent from the text node of Wikipedia's MediaWiki API reply. Quote from + /// https://www.mediawiki.org/wiki/Reading/Web/Desktop_Improvements/Features/Table_of_contents#How_can_I_get_the_old_table_of_contents? + /// We intentionally do not add the old table of contents to the article in addition to the new sidebar location... + /// Users can restore the old table of contents position with the following JavaScript code: + /// document.querySelector('mw\\3Atocplace,meta[property="mw:PageProp/toc"]').replaceWith( document.getElementById('mw-panel-toc') ) + /// + /// This function searches for an indicator of the empty ToC in an article HTML. If the indicator is present, + /// generates ToC HTML from the sections element and replaces the indicator with the generated ToC. + static void generateTableOfContentsIfEmpty( QDomNode const & parseNode, QString & articleString ) + { + QString const emptyTocIndicator = ""; + int const emptyTocPos = articleString.indexOf( emptyTocIndicator ); + if( emptyTocPos == -1 ) + return; // The ToC must be absent or nonempty => nothing to do. + + QDomElement const sectionsElement = parseNode.firstChildElement( "sections" ); + if( sectionsElement.isNull() ) + { + gdWarning( "MediaWiki: empty table of contents and missing sections element." ); + return; + } + + gdDebug( "MediaWiki: generating table of contents from the sections element." ); + MediaWikiSectionsParser parser; + parser.generateTableOfContents( sectionsElement ); + articleString.replace( emptyTocPos, emptyTocIndicator.size(), parser.tableOfContents ); + } + +private: + MediaWikiSectionsParser() : previousLevel( 0 ) {} + void generateTableOfContents( QDomElement const & sectionsElement ); + + bool addListLevel( QString const & levelString ); + void closeListTags( int currentLevel ); + + QString tableOfContents; + int previousLevel; +}; + +void MediaWikiSectionsParser::generateTableOfContents( QDomElement const & sectionsElement ) +{ + // A real example of a typical child of the element: + // + + // Use Wiktionary's ToC style, which had also been Wikipedia's ToC style until the UI redesign. + // Replace double quotes with single quotes to avoid escaping " within string literals. + + QString const elTagName = "s"; + QDomElement el = sectionsElement.firstChildElement( elTagName ); + if( el.isNull() ) + return; + + // Omit invisible and useless toctogglecheckbox, toctogglespan and toctogglelabel elements. + // The values of lang (e.g. 'en') and dir (e.g. 'ltr') attributes of the toctitle element depend on + // the article's language. These attributes have no visible effect and so are simply omitted here. + // TODO: the "Contents" string should be translated to the article's language, but I don't know how + // to implement this. Should "Contents" be enclosed in tr() to at least translate it to GoldenDict's + // interface language? Is there a language-agnostic Unicode symbol that stands for "Contents"? + tableOfContents = ""; +} + +bool MediaWikiSectionsParser::addListLevel( QString const & levelString ) +{ + bool convertedToInt; + int const level = levelString.toInt( &convertedToInt ); + + if( !convertedToInt ) + { + gdWarning( "MediaWiki: sections level is not an integer: %s", levelString.toUtf8().constData() ); + return false; + } + if( level <= 0 ) + { + gdWarning( "MediaWiki: unsupported nonpositive sections level: %s", levelString.toUtf8().constData() ); + return false; + } + if( level > previousLevel + 1 ) + { + gdWarning( "MediaWiki: unsupported sections level increase by more than one: from %d to %s", + previousLevel, levelString.toUtf8().constData() ); + return false; + } + + if( level == previousLevel + 1 ) + { + // Don't close the previous list item tag to nest the current deeper level's list in it. + tableOfContents += "\n\n\n"; + --previousLevel; + } +} + class MediaWikiArticleRequest: public MediaWikiDataRequestSlots { typedef std::list< std::pair< QNetworkReply *, bool > > NetReplies; @@ -269,7 +420,7 @@ void MediaWikiArticleRequest::addQuery( QNetworkAccessManager & mgr, { gdDebug( "MediaWiki: requesting article %s\n", gd::toQString( str ).toUtf8().data() ); - QUrl reqUrl( url + "/api.php?action=parse&prop=text|revid&format=xml&redirects" ); + QUrl reqUrl( url + "/api.php?action=parse&prop=text|revid|sections&format=xml&redirects" ); Utils::Url::addQueryItem( reqUrl, "page", gd::toQString( str ).replace( '+', "%2B" ) ); QNetworkRequest req( reqUrl ) ; @@ -489,6 +640,9 @@ void MediaWikiArticleRequest::requestFinished( QNetworkReply * r ) } + // Insert the ToC in the end to improve performance because no replacements are needed in the generated ToC. + MediaWikiSectionsParser::generateTableOfContentsIfEmpty( parseNode, articleString ); + QByteArray articleBody = articleString.toUtf8(); articleBody.prepend( dictPtr->isToLanguageRTL() ? R"(
)" :