improve: handle invalid tag soup, related to old issue #271

This commit is contained in:
yifang 2022-01-11 20:33:46 +08:00
parent ccc0f275ba
commit 37d22bc412
9 changed files with 32 additions and 40 deletions

View file

@ -524,11 +524,10 @@ void AardDictionary::loadArticle( quint32 address,
articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() ); articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() );
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>" string cleaner = "";
"</a></a></a></a></a></a></a></a>";
string prefix( "<div class=\"aard\"" ); string prefix( "<div class=\"aard\"" );
if( isToLanguageRTL() ) if( isToLanguageRTL() )

View file

@ -47,8 +47,7 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word,
bool expandOptionalParts ) const bool expandOptionalParts ) const
{ {
string result = string result =
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" " "<!DOCTYPE html>"
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"
"<html><head>" "<html><head>"
"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"; "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";

8
bgl.cc
View file

@ -858,10 +858,10 @@ void BglArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i; multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>"; string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{ {
if (dict.isFromLanguageRTL() ) // RTL support if (dict.isFromLanguageRTL() ) // RTL support

9
mdx.cc
View file

@ -707,11 +707,10 @@ void MdxArticleRequest::run()
} }
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>" string cleaner = "";
"</a></a></a></a></a></a></a></a>";
articleText += "<div class=\"mdict\">" + articleBody + cleaner + "</div>\n"; articleText += "<div class=\"mdict\">" + articleBody + cleaner + "</div>\n";
} }

View file

@ -799,11 +799,10 @@ void SlobDictionary::loadArticle( quint32 address,
articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() ); articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() );
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>" string cleaner = "";
"</a></a></a></a></a></a></a></a>";
string prefix( "<div class=\"slobdict\"" ); string prefix( "<div class=\"slobdict\"" );
if( isToLanguageRTL() ) if( isToLanguageRTL() )

View file

@ -1464,11 +1464,10 @@ void StardictArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i; multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>"; string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{ {
result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">"; result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";

View file

@ -281,11 +281,9 @@ void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
} }
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
articleString += "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
QByteArray articleBody = articleString.toUtf8(); QByteArray articleBody = articleString.toUtf8();

View file

@ -584,10 +584,10 @@ void XdxfArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i; multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>"; string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i ) for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{ {

9
zim.cc
View file

@ -1286,11 +1286,10 @@ void ZimArticleRequest::run()
string result; string result;
// See Issue #271: A mechanism to clean-up invalid HTML cards. // See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>" // leave the invalid tags at the mercy of modern browsers.(webengine chrome)
"</font>""</font>""</font>""</font>""</font>""</font>" // https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
"</b></b></b></b></b></b></b></b>" // https://en.wikipedia.org/wiki/Tag_soup#HTML5
"</i></i></i></i></i></i></i></i>" string cleaner = "";
"</a></a></a></a></a></a></a></a>";
multimap< wstring, pair< string, string > >::const_iterator i; multimap< wstring, pair< string, string > >::const_iterator i;