mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-30 17:24:08 +00:00
*! Do proper '<charset c=t>..</charset>' replacements with a regexp.
This commit is contained in:
parent
44b5524a91
commit
484c3e356b
108
src/bgl.cc
108
src/bgl.cc
|
@ -25,6 +25,8 @@
|
||||||
#include <QThreadPool>
|
#include <QThreadPool>
|
||||||
#include <QAtomicInt>
|
#include <QAtomicInt>
|
||||||
|
|
||||||
|
#include <QRegExp>
|
||||||
|
|
||||||
namespace Bgl {
|
namespace Bgl {
|
||||||
|
|
||||||
using std::map;
|
using std::map;
|
||||||
|
@ -826,103 +828,33 @@ sptr< Dictionary::DataRequest > BglDictionary::getResource( string const & name
|
||||||
/// Replaces <CHARSET c="t">1234;</CHARSET> occurrences with &#x1234;
|
/// Replaces <CHARSET c="t">1234;</CHARSET> occurrences with &#x1234;
|
||||||
void BglDictionary::replaceCharsetEntities( string & text )
|
void BglDictionary::replaceCharsetEntities( string & text )
|
||||||
{
|
{
|
||||||
string lowercased = text;
|
QRegExp charsetExp( "<\\s*charset\\s+c\\s*=\\s*[\"']?t[\"']?\\s*>((?:\\s*[0-9a-fA-F]+\\s*;\\s*)*)<\\s*/\\s*charset\\s*>",
|
||||||
|
Qt::CaseInsensitive );
|
||||||
|
|
||||||
// Make a lowercased version of text, used for searching only. Only touch
|
charsetExp.setMinimal( true );
|
||||||
// symbols < 0x80 to avoid any weird results.
|
|
||||||
for( unsigned x = lowercased.size(); x--; )
|
QRegExp oneValueExp( "\\s*([0-9a-fA-F]+)\\s*;" );
|
||||||
if ( (unsigned char )lowercased[ x ] < 0x80 )
|
|
||||||
lowercased[ x ] = tolower( lowercased[ x ] );
|
|
||||||
|
|
||||||
size_t prevPos = 0;
|
QString str = QString::fromUtf8( text.c_str() );
|
||||||
|
|
||||||
for( ;; )
|
for( int pos = 0; ( pos = charsetExp.indexIn( str, pos ) ) != -1; )
|
||||||
{
|
{
|
||||||
size_t pos = lowercased.find( "<charset c=\"t\">", prevPos );
|
//printf( "Match: %s\n", str.mid( pos, charsetExp.matchedLength() ).toUtf8().data() );
|
||||||
|
|
||||||
|
QString out;
|
||||||
|
|
||||||
if ( pos == string::npos )
|
for( int p = 0; ( p = oneValueExp.indexIn( charsetExp.cap( 1 ), p ) ) != -1; )
|
||||||
break;
|
|
||||||
|
|
||||||
if ( lowercased.size() - pos < 30 )
|
|
||||||
{
|
{
|
||||||
// This is not right, the string is too short, leave it alone
|
//printf( "Cap: %s\n", oneValueExp.cap( 1 ).toUtf8().data() );
|
||||||
break;
|
out += "&#x" + oneValueExp.cap( 1 ) + ";";
|
||||||
|
|
||||||
|
p += oneValueExp.matchedLength();
|
||||||
}
|
}
|
||||||
|
|
||||||
prevPos = pos + 1;
|
str.replace( pos, charsetExp.matchedLength(), out );
|
||||||
|
|
||||||
if ( lowercased.substr( pos + 15 + 4, 11 ) != ";</charset>" )
|
|
||||||
{
|
|
||||||
// The ending doesn't match
|
|
||||||
printf( "!!!!!!ending mismatch\n" );
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if digits are all hex
|
|
||||||
|
|
||||||
if ( !isxdigit( lowercased[ pos + 15 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 16 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 17 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 18 ] ) )
|
|
||||||
{
|
|
||||||
printf( "!!!!!!!!not hex digits\n" );
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ok, replace now.
|
|
||||||
|
|
||||||
lowercased.replace( pos, 15, "&#x" );
|
|
||||||
lowercased.erase( pos + 8, 10 );
|
|
||||||
|
|
||||||
text.replace( pos, 15, "&#x" );
|
|
||||||
text.erase( pos + 8, 10 );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
prevPos = 0;
|
text = str.toUtf8().data();
|
||||||
|
|
||||||
// Copy-pasted version for <charset c=t>. This should all be replaced
|
|
||||||
// by regexps.
|
|
||||||
for( ;; )
|
|
||||||
{
|
|
||||||
size_t pos = lowercased.find( "<charset c=t>", prevPos );
|
|
||||||
|
|
||||||
if ( pos == string::npos )
|
|
||||||
break;
|
|
||||||
|
|
||||||
if ( lowercased.size() - pos < 28 )
|
|
||||||
{
|
|
||||||
// This is not right, the string is too short, leave it alone
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
prevPos = pos + 1;
|
|
||||||
|
|
||||||
if ( lowercased.substr( pos + 13 + 4, 11 ) != ";</charset>" )
|
|
||||||
{
|
|
||||||
// The ending doesn't match
|
|
||||||
printf( "!!!!!!ending mismatch\n" );
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if digits are all hex
|
|
||||||
|
|
||||||
if ( !isxdigit( lowercased[ pos + 13 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 14 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 15 ] ) ||
|
|
||||||
!isxdigit( lowercased[ pos + 16 ] ) )
|
|
||||||
{
|
|
||||||
printf( "!!!!!!!!not hex digits\n" );
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ok, replace now.
|
|
||||||
|
|
||||||
lowercased.replace( pos, 13, "&#x" );
|
|
||||||
lowercased.erase( pos + 8, 10 );
|
|
||||||
|
|
||||||
text.replace( pos, 13, "&#x" );
|
|
||||||
text.erase( pos + 8, 10 );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class ResourceHandler: public Babylon::ResourceHandler
|
class ResourceHandler: public Babylon::ResourceHandler
|
||||||
|
|
Loading…
Reference in a new issue