mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-12-17 23:04:06 +00:00
fix: stardict parses HTML incorrectly
stardict/xdxf use QDomDocument to parse the HTML, which results in an incorrect HTML structure.
This commit is contained in:
parent
c11cff46f0
commit
f25bee96e5
|
@ -59,6 +59,9 @@ namespace Html {
|
|||
const static QRegularExpression startDivTag( R"(<div[\s>])" );
|
||||
const static QRegularExpression htmlEntity( R"(&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);)" );
|
||||
|
||||
// exclude <br/> <hr/>
|
||||
const static QRegularExpression emptyXmlTag(R"(<(?!(br|hr)\b)([^/ >]*)\s*/>)");
|
||||
|
||||
bool containHtmlEntity( std::string const & text );
|
||||
}
|
||||
|
||||
|
|
|
@ -22,10 +22,8 @@
|
|||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
// msvc defines _WIN32 https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170
|
||||
// gcc also defines __WIN32, _WIN32, __WIN32__
|
||||
// todo: unify how windows are detected on headers
|
||||
#ifndef _WIN32
|
||||
|
||||
#ifndef Q_OS_WIN
|
||||
#include <arpa/inet.h>
|
||||
#else
|
||||
#include <winsock.h>
|
||||
|
@ -94,7 +92,7 @@ struct Ifo
|
|||
string sametypesequence, dicttype, description;
|
||||
string copyright, author, email, website, date;
|
||||
|
||||
Ifo( File::Class & );
|
||||
explicit Ifo( File::Class & );
|
||||
};
|
||||
|
||||
enum
|
||||
|
@ -799,7 +797,7 @@ void StardictDictionary::pangoToHtml( QString & text )
|
|||
else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
|
||||
|| style.compare( "style", Qt::CaseInsensitive ) == 0)
|
||||
newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
|
||||
else if( style.compare( "weight", Qt::CaseInsensitive ) == 0
|
||||
else if( style.compare( "font_weight", Qt::CaseInsensitive ) == 0
|
||||
|| style.compare( "weight", Qt::CaseInsensitive ) == 0)
|
||||
{
|
||||
QString str = styleRegex.cap( 2 );
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
#include <QRegularExpression>
|
||||
|
||||
#include "globalregex.hh"
|
||||
|
||||
namespace Xdxf2Html {
|
||||
|
||||
static void fixLink( QDomElement & el, string const & dictId, const char *attrName )
|
||||
|
@ -69,8 +71,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
|||
Dictionary::Class *dictPtr, IndexedZip * resourceZip,
|
||||
bool isLogicalFormat, unsigned revisionNumber, QString * headword )
|
||||
{
|
||||
// GD_DPRINTF( "Source>>>>>>>>>>: %s\n\n\n", in.c_str() );
|
||||
|
||||
// Convert spaces after each end of line to &nbsp;s, and then each end of
|
||||
// line to a <br>
|
||||
|
||||
|
@ -108,11 +108,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
|||
}
|
||||
}
|
||||
|
||||
// Strip "<nu />" tags - QDomDocument don't handle it correctly
|
||||
string::size_type n;
|
||||
while( ( n = inConverted.find( "<nu />" ) ) != string::npos )
|
||||
inConverted.erase( n, 6 );
|
||||
|
||||
// We build a dom representation of the given xml, then do some transforms
|
||||
QDomDocument dd;
|
||||
|
||||
|
@ -703,9 +698,8 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
|||
el.setAttribute( "class", "xdxf_rref" );
|
||||
}
|
||||
|
||||
// GD_DPRINTF( "Result>>>>>>>>>>: %s\n\n\n", dd.toByteArray( 0 ).data() );
|
||||
|
||||
return dd.toString( 1 ).remove('\n').remove( QRegularExpression( "<(b|i)/>" ) ).toUtf8().data();
|
||||
// Workaround: XML is not well suited to processing HTML content; for example, <blockquote/> is valid in XML but invalid in HTML.
|
||||
return dd.toString().remove( RX::Html::emptyXmlTag ).toUtf8().data();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue