mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 15:24:05 +00:00
fix: stardict parses html incorrectly
stardict/xdxf use QDomDocument to parse the html, which results in an incorrect html structure.
This commit is contained in:
parent
c11cff46f0
commit
f25bee96e5
|
@ -59,6 +59,9 @@ namespace Html {
|
||||||
const static QRegularExpression startDivTag( R"(<div[\s>])" );
|
const static QRegularExpression startDivTag( R"(<div[\s>])" );
|
||||||
const static QRegularExpression htmlEntity( R"(&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);)" );
|
const static QRegularExpression htmlEntity( R"(&(?:#\d+|#[xX][\da-fA-F]+|[0-9a-zA-Z]+);)" );
|
||||||
|
|
||||||
|
// exclude <br/> <hr/>
|
||||||
|
const static QRegularExpression emptyXmlTag(R"(<(?!(br|hr)\b)([^/ >]*)\s*/>)");
|
||||||
|
|
||||||
bool containHtmlEntity( std::string const & text );
|
bool containHtmlEntity( std::string const & text );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,10 +22,8 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
// msvc defines _WIN32 https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170
|
|
||||||
// gcc also defines __WIN32, _WIN32, __WIN32__
|
#ifndef Q_OS_WIN
|
||||||
// todo: unify how windows are detected on headers
|
|
||||||
#ifndef _WIN32
|
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
#else
|
#else
|
||||||
#include <winsock.h>
|
#include <winsock.h>
|
||||||
|
@ -94,7 +92,7 @@ struct Ifo
|
||||||
string sametypesequence, dicttype, description;
|
string sametypesequence, dicttype, description;
|
||||||
string copyright, author, email, website, date;
|
string copyright, author, email, website, date;
|
||||||
|
|
||||||
Ifo( File::Class & );
|
explicit Ifo( File::Class & );
|
||||||
};
|
};
|
||||||
|
|
||||||
enum
|
enum
|
||||||
|
@ -799,7 +797,7 @@ void StardictDictionary::pangoToHtml( QString & text )
|
||||||
else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
|
else if( style.compare( "font_style", Qt::CaseInsensitive ) == 0
|
||||||
|| style.compare( "style", Qt::CaseInsensitive ) == 0)
|
|| style.compare( "style", Qt::CaseInsensitive ) == 0)
|
||||||
newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
|
newSpan += QString( "font-style:" ) + styleRegex.cap( 2 ) + ";";
|
||||||
else if( style.compare( "weight", Qt::CaseInsensitive ) == 0
|
else if( style.compare( "font_weight", Qt::CaseInsensitive ) == 0
|
||||||
|| style.compare( "weight", Qt::CaseInsensitive ) == 0)
|
|| style.compare( "weight", Qt::CaseInsensitive ) == 0)
|
||||||
{
|
{
|
||||||
QString str = styleRegex.cap( 2 );
|
QString str = styleRegex.cap( 2 );
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
|
|
||||||
#include <QRegularExpression>
|
#include <QRegularExpression>
|
||||||
|
|
||||||
|
#include "globalregex.hh"
|
||||||
|
|
||||||
namespace Xdxf2Html {
|
namespace Xdxf2Html {
|
||||||
|
|
||||||
static void fixLink( QDomElement & el, string const & dictId, const char *attrName )
|
static void fixLink( QDomElement & el, string const & dictId, const char *attrName )
|
||||||
|
@ -69,8 +71,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
||||||
Dictionary::Class *dictPtr, IndexedZip * resourceZip,
|
Dictionary::Class *dictPtr, IndexedZip * resourceZip,
|
||||||
bool isLogicalFormat, unsigned revisionNumber, QString * headword )
|
bool isLogicalFormat, unsigned revisionNumber, QString * headword )
|
||||||
{
|
{
|
||||||
// GD_DPRINTF( "Source>>>>>>>>>>: %s\n\n\n", in.c_str() );
|
|
||||||
|
|
||||||
// Convert spaces after each end of line to &nbsp;s, and then each end of
|
// Convert spaces after each end of line to &nbsp;s, and then each end of
|
||||||
// line to a <br>
|
// line to a <br>
|
||||||
|
|
||||||
|
@ -108,11 +108,6 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Strip "<nu />" tags - QDomDocument don't handle it correctly
|
|
||||||
string::size_type n;
|
|
||||||
while( ( n = inConverted.find( "<nu />" ) ) != string::npos )
|
|
||||||
inConverted.erase( n, 6 );
|
|
||||||
|
|
||||||
// We build a dom representation of the given xml, then do some transforms
|
// We build a dom representation of the given xml, then do some transforms
|
||||||
QDomDocument dd;
|
QDomDocument dd;
|
||||||
|
|
||||||
|
@ -703,9 +698,8 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
|
||||||
el.setAttribute( "class", "xdxf_rref" );
|
el.setAttribute( "class", "xdxf_rref" );
|
||||||
}
|
}
|
||||||
|
|
||||||
// GD_DPRINTF( "Result>>>>>>>>>>: %s\n\n\n", dd.toByteArray( 0 ).data() );
|
// Workaround: XML is not well suited to processing HTML content; for example, <blockquote/> is valid in XML but invalid in HTML.
|
||||||
|
return dd.toString().remove( RX::Html::emptyXmlTag ).toUtf8().data();
|
||||||
return dd.toString( 1 ).remove('\n').remove( QRegularExpression( "<(b|i)/>" ) ).toUtf8().data();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue