improve: handle invalid tag soup, related to old issue #271

This commit is contained in:
yifang 2022-01-11 20:33:46 +08:00
parent ccc0f275ba
commit 37d22bc412
9 changed files with 32 additions and 40 deletions

View file

@ -524,11 +524,10 @@ void AardDictionary::loadArticle( quint32 address,
articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() );
// See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
string prefix( "<div class=\"aard\"" );
if( isToLanguageRTL() )

View file

@ -47,8 +47,7 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word,
bool expandOptionalParts ) const
{
string result =
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" "
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"
"<!DOCTYPE html>"
"<html><head>"
"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">";

8
bgl.cc
View file

@ -858,10 +858,10 @@ void BglArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{
if (dict.isFromLanguageRTL() ) // RTL support

9
mdx.cc
View file

@ -707,11 +707,10 @@ void MdxArticleRequest::run()
}
// See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
articleText += "<div class=\"mdict\">" + articleBody + cleaner + "</div>\n";
}

View file

@ -799,11 +799,10 @@ void SlobDictionary::loadArticle( quint32 address,
articleText = string( QObject::tr( "Article decoding error" ).toUtf8().constData() );
// See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
string prefix( "<div class=\"slobdict\"" );
if( isToLanguageRTL() )

View file

@ -1464,11 +1464,10 @@ void StardictArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{
result += dict.isFromLanguageRTL() ? "<h3 class=\"sdct_headwords\" dir=\"rtl\">" : "<h3 class=\"sdct_headwords\">";

View file

@ -281,11 +281,9 @@ void WebSiteArticleRequest::requestFinished( QNetworkReply * r )
}
// See Issue #271: A mechanism to clean-up invalid HTML cards.
articleString += "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
QByteArray articleBody = articleString.toUtf8();

View file

@ -584,10 +584,10 @@ void XdxfArticleRequest::run()
multimap< wstring, pair< string, string > >::const_iterator i;
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
for( i = mainArticles.begin(); i != mainArticles.end(); ++i )
{

9
zim.cc
View file

@ -1286,11 +1286,10 @@ void ZimArticleRequest::run()
string result;
// See Issue #271: A mechanism to clean-up invalid HTML cards.
string cleaner = "</font>""</font>""</font>""</font>""</font>""</font>"
"</font>""</font>""</font>""</font>""</font>""</font>"
"</b></b></b></b></b></b></b></b>"
"</i></i></i></i></i></i></i></i>"
"</a></a></a></a></a></a></a></a>";
// Leave the invalid tags to the error handling of modern browsers (WebEngine, Chrome).
// https://html.spec.whatwg.org/#an-introduction-to-error-handling-and-strange-cases-in-the-parser
// https://en.wikipedia.org/wiki/Tag_soup#HTML5
string cleaner = "";
multimap< wstring, pair< string, string > >::const_iterator i;