fix: stardict parses HTML incorrectly

The stardict/xdxf converters use QDomDocument to parse the HTML, which results in an incorrect HTML structure.
This commit is contained in:
xiaoyifang 2023-05-07 18:02:22 +08:00
parent c11cff46f0
commit f25bee96e5
3 changed files with 11 additions and 16 deletions

View file

@ -59,6 +59,9 @@ namespace Html {
const static QRegularExpression startDivTag( R"(<div[\s>])" ); const static QRegularExpression startDivTag( R"(<div[\s>])" );
const static QRegularExpression htmlEntity( R"(&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);)" ); const static QRegularExpression htmlEntity( R"(&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);)" );
// exclude <br/> <hr/>
const static QRegularExpression emptyXmlTag(R"(<(?!(br|hr)\b)([^/ >]*)\s*/>)");
bool containHtmlEntity( std::string const & text ); bool containHtmlEntity( std::string const & text );
} }

View file

@ -22,10 +22,8 @@
#include <map> #include <map>
#include <set> #include <set>
#include <string> #include <string>
// msvc defines _WIN32 https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170
// gcc also defines __WIN32, _WIN32, __WIN32__ #ifndef Q_OS_WIN
// todo: unify how windows are detected on headers
#ifndef _WIN32
#include <arpa/inet.h> #include <arpa/inet.h>
#else #else
#include <winsock.h> #include <winsock.h>
@ -94,7 +92,7 @@ struct Ifo
string sametypesequence, dicttype, description; string sametypesequence, dicttype, description;
string copyright, author, email, website, date; string copyright, author, email, website, date;
Ifo( File::Class & ); explicit Ifo( File::Class & );
}; };
enum enum
@ -799,7 +797,7 @@ void StardictDictionary::pangoToHtml( QString & text )
else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0 else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
|| style.compare( "style", Qt::CaseInsensitive ) == 0) || style.compare( "style", Qt::CaseInsensitive ) == 0)
newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";"; newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
else if( style.compare( "weight", Qt::CaseInsensitive ) == 0 else if( style.compare( "font_weight", Qt::CaseInsensitive ) == 0
|| style.compare( "weight", Qt::CaseInsensitive ) == 0) || style.compare( "weight", Qt::CaseInsensitive ) == 0)
{ {
QString str = styleRegex.cap( 2 ); QString str = styleRegex.cap( 2 );

View file

@ -17,6 +17,8 @@
#include <QRegularExpression> #include <QRegularExpression>
#include "globalregex.hh"
namespace Xdxf2Html { namespace Xdxf2Html {
static void fixLink( QDomElement & el, string const & dictId, const char *attrName ) static void fixLink( QDomElement & el, string const & dictId, const char *attrName )
@ -69,8 +71,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
Dictionary::Class *dictPtr, IndexedZip * resourceZip, Dictionary::Class *dictPtr, IndexedZip * resourceZip,
bool isLogicalFormat, unsigned revisionNumber, QString * headword ) bool isLogicalFormat, unsigned revisionNumber, QString * headword )
{ {
// GD_DPRINTF( "Source>>>>>>>>>>: %s\n\n\n", in.c_str() );
// Convert spaces after each end of line to &nbsp;s, and then each end of // Convert spaces after each end of line to &nbsp;s, and then each end of
// line to a <br> // line to a <br>
@ -108,11 +108,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
} }
} }
// Strip "<nu />" tags - QDomDocument don't handle it correctly
string::size_type n;
while( ( n = inConverted.find( "<nu />" ) ) != string::npos )
inConverted.erase( n, 6 );
// We build a dom representation of the given xml, then do some transforms // We build a dom representation of the given xml, then do some transforms
QDomDocument dd; QDomDocument dd;
@ -703,9 +698,8 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
el.setAttribute( "class", "xdxf_rref" ); el.setAttribute( "class", "xdxf_rref" );
} }
// GD_DPRINTF( "Result>>>>>>>>>>: %s\n\n\n", dd.toByteArray( 0 ).data() ); //workaround, the xml is not very suitable to process the html content. such as <blockquote/> is valid in xml ,while invalid in html.
return dd.toString().remove( RX::Html::emptyXmlTag ).toUtf8().data();
return dd.toString( 1 ).remove('\n').remove( QRegularExpression( "<(b|i)/>" ) ).toUtf8().data();
} }
} }