mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-24 04:24:09 +00:00
fix: support dsl dictionary utf encoding detection (#830)
* fix: support dsl dictionary utf encoding detection
* fix: code smells
* fix: code smells
* 🎨 apply clang-format changes
* fix: adjust linefeed
---------
Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
parent
5fd9261047
commit
67ed24c61c
|
@ -4,32 +4,28 @@
|
||||||
#include "utf8.hh"
|
#include "utf8.hh"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <QByteArray>
|
||||||
|
#include <QString>
|
||||||
|
|
||||||
namespace Utf8 {
|
namespace Utf8 {
|
||||||
|
|
||||||
size_t encode( wchar const * in, size_t inSize, char * out_ )
|
size_t encode( wchar const * in, size_t inSize, char * out_ )
|
||||||
{
|
{
|
||||||
unsigned char * out = (unsigned char *) out_;
|
unsigned char * out = (unsigned char *)out_;
|
||||||
|
|
||||||
while( inSize-- )
|
while ( inSize-- ) {
|
||||||
{
|
|
||||||
if ( *in < 0x80 )
|
if ( *in < 0x80 )
|
||||||
*out++ = *in++;
|
*out++ = *in++;
|
||||||
else
|
else if ( *in < 0x800 ) {
|
||||||
if ( *in < 0x800 )
|
|
||||||
{
|
|
||||||
*out++ = 0xC0 | ( *in >> 6 );
|
*out++ = 0xC0 | ( *in >> 6 );
|
||||||
*out++ = 0x80 | ( *in++ & 0x3F );
|
*out++ = 0x80 | ( *in++ & 0x3F );
|
||||||
}
|
}
|
||||||
else
|
else if ( *in < 0x10000 ) {
|
||||||
if ( *in < 0x10000 )
|
|
||||||
{
|
|
||||||
*out++ = 0xE0 | ( *in >> 12 );
|
*out++ = 0xE0 | ( *in >> 12 );
|
||||||
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
|
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
|
||||||
*out++ = 0x80 | ( *in++ & 0x3F );
|
*out++ = 0x80 | ( *in++ & 0x3F );
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
*out++ = 0xF0 | ( *in >> 18 );
|
*out++ = 0xF0 | ( *in >> 18 );
|
||||||
*out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
|
*out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
|
||||||
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
|
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
|
||||||
|
@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return out - (unsigned char *) out_;
|
return out - (unsigned char *)out_;
|
||||||
}
|
}
|
||||||
|
|
||||||
long decode( char const * in_, size_t inSize, wchar * out_ )
|
long decode( char const * in_, size_t inSize, wchar * out_ )
|
||||||
{
|
{
|
||||||
unsigned char const * in = (unsigned char const *) in_;
|
unsigned char const * in = (unsigned char const *)in_;
|
||||||
wchar * out = out_;
|
wchar * out = out_;
|
||||||
|
|
||||||
while( inSize-- )
|
while ( inSize-- ) {
|
||||||
{
|
|
||||||
wchar result;
|
wchar result;
|
||||||
|
|
||||||
if ( *in & 0x80 )
|
if ( *in & 0x80 ) {
|
||||||
{
|
if ( *in & 0x40 ) {
|
||||||
if ( *in & 0x40 )
|
if ( *in & 0x20 ) {
|
||||||
{
|
if ( *in & 0x10 ) {
|
||||||
if ( *in & 0x20 )
|
|
||||||
{
|
|
||||||
if ( *in & 0x10 )
|
|
||||||
{
|
|
||||||
// Four-byte sequence
|
// Four-byte sequence
|
||||||
if ( *in & 8 )
|
if ( *in & 8 )
|
||||||
// This can't be
|
// This can't be
|
||||||
|
@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
|
||||||
|
|
||||||
inSize -= 3;
|
inSize -= 3;
|
||||||
|
|
||||||
result = ( (wchar )*in++ & 7 ) << 18;
|
result = ( (wchar)*in++ & 7 ) << 18;
|
||||||
|
|
||||||
if ( ( *in & 0xC0 ) != 0x80 )
|
if ( ( *in & 0xC0 ) != 0x80 )
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
|
||||||
return -1;
|
return -1;
|
||||||
result |= (wchar)*in++ & 0x3F;
|
result |= (wchar)*in++ & 0x3F;
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// Three-byte sequence
|
// Three-byte sequence
|
||||||
|
|
||||||
if ( inSize < 2 )
|
if ( inSize < 2 )
|
||||||
|
@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
|
||||||
|
|
||||||
inSize -= 2;
|
inSize -= 2;
|
||||||
|
|
||||||
result = ( (wchar )*in++ & 0xF ) << 12;
|
result = ( (wchar)*in++ & 0xF ) << 12;
|
||||||
|
|
||||||
if ( ( *in & 0xC0 ) != 0x80 )
|
if ( ( *in & 0xC0 ) != 0x80 )
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
|
||||||
result |= (wchar)*in++ & 0x3F;
|
result |= (wchar)*in++ & 0x3F;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// Two-byte sequence
|
// Two-byte sequence
|
||||||
if ( !inSize )
|
if ( !inSize )
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
--inSize;
|
--inSize;
|
||||||
|
|
||||||
result = ( (wchar )*in++ & 0x1F ) << 6;
|
result = ( (wchar)*in++ & 0x1F ) << 6;
|
||||||
|
|
||||||
if ( ( *in & 0xC0 ) != 0x80 )
|
if ( ( *in & 0xC0 ) != 0x80 )
|
||||||
return -1;
|
return -1;
|
||||||
result |= (wchar)*in++ & 0x3F;
|
result |= (wchar)*in++ & 0x3F;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
// This char is from the middle of encoding, it can't be leading
|
// This char is from the middle of encoding, it can't be leading
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept
|
||||||
|
|
||||||
std::vector< char > buffer( in.size() * 4 );
|
std::vector< char > buffer( in.size() * 4 );
|
||||||
|
|
||||||
return string( &buffer.front(),
|
return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
|
||||||
encode( in.data(), in.size(), &buffer.front() ) );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
wstring decode( string const & in )
|
wstring decode( string const & in )
|
||||||
{
|
{
|
||||||
if ( in.empty() )
|
if ( in.empty() )
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
std::vector< wchar > buffer( in.size() );
|
std::vector< wchar > buffer( in.size() );
|
||||||
|
|
||||||
long result = decode( in.data(), in.size(), &buffer.front() );
|
long result = decode( in.data(), in.size(), &buffer.front() );
|
||||||
|
|
||||||
if ( result < 0 )
|
if ( result < 0 )
|
||||||
throw exCantDecode( in );
|
throw exCantDecode( in );
|
||||||
|
@ -160,8 +147,7 @@ wstring decode( string const & in )
|
||||||
|
|
||||||
bool isspace( int c )
|
bool isspace( int c )
|
||||||
{
|
{
|
||||||
switch( c )
|
switch ( c ) {
|
||||||
{
|
|
||||||
case ' ':
|
case ' ':
|
||||||
case '\f':
|
case '\f':
|
||||||
case '\n':
|
case '\n':
|
||||||
|
@ -176,62 +162,95 @@ bool isspace( int c )
|
||||||
}
|
}
|
||||||
|
|
||||||
//get the first line in string s1. -1 if not found
|
//get the first line in string s1. -1 if not found
|
||||||
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
|
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
|
||||||
{
|
{
|
||||||
char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
|
char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length );
|
||||||
|
|
||||||
if (pos == s1 + s1length)
|
if ( pos == s1 + s1length )
|
||||||
return pos-s1;
|
return pos - s1;
|
||||||
|
|
||||||
//the line size.
|
//the line size.
|
||||||
return pos- s1+ s2length;
|
return pos - s1 + s2length;
|
||||||
}
|
}
|
||||||
|
|
||||||
char const* getEncodingNameFor(Encoding e)
|
char const * getEncodingNameFor( Encoding e )
|
||||||
{
|
{
|
||||||
switch (e)
|
switch ( e ) {
|
||||||
{
|
case Utf32LE:
|
||||||
|
return "UTF-32LE";
|
||||||
|
case Utf32BE:
|
||||||
|
return "UTF-32BE";
|
||||||
case Utf16LE:
|
case Utf16LE:
|
||||||
return "UTF-16LE";
|
return "UTF-16LE";
|
||||||
case Utf16BE:
|
case Utf16BE:
|
||||||
return "UTF-16BE";
|
return "UTF-16BE";
|
||||||
case Windows1252:
|
case Windows1252:
|
||||||
return "WINDOWS-1252";
|
return "WINDOWS-1252";
|
||||||
case Windows1251:
|
case Windows1251:
|
||||||
return "WINDOWS-1251";
|
return "WINDOWS-1251";
|
||||||
case Utf8:
|
case Utf8:
|
||||||
return "UTF-8";
|
return "UTF-8";
|
||||||
case Windows1250:
|
case Windows1250:
|
||||||
|
return "WINDOWS-1250";
|
||||||
default:
|
default:
|
||||||
return "WINDOWS-1250";
|
return "UTF-8";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LineFeed initLineFeed(Encoding e)
|
Encoding getEncodingForName( const QByteArray & _name )
|
||||||
{
|
{
|
||||||
LineFeed lf;
|
const auto name = _name.toUpper();
|
||||||
switch (e)
|
if ( name == "UTF-32LE" )
|
||||||
{
|
return Utf32LE;
|
||||||
case Utf8::Utf16LE:
|
if ( name == "UTF-32BE" )
|
||||||
lf.lineFeed= new char[2]{ 0x0A,0 };
|
return Utf32BE;
|
||||||
lf.length = 2;
|
if ( name == "UTF-16LE" )
|
||||||
break;
|
return Utf16LE;
|
||||||
case Utf8::Utf16BE:
|
if ( name == "UTF-16BE" )
|
||||||
lf.lineFeed = new char[2]{ 0,0x0A };
|
return Utf16BE;
|
||||||
lf.length = 2;
|
if ( name == "WINDOWS-1252" )
|
||||||
break;
|
return Windows1252;
|
||||||
case Utf8::Windows1252:
|
if ( name == "WINDOWS-1251" )
|
||||||
|
return Windows1251;
|
||||||
case Utf8::Windows1251:
|
if ( name == "UTF-8" )
|
||||||
|
return Utf8;
|
||||||
case Utf8::Utf8:
|
if ( name == "WINDOWS-1250" )
|
||||||
|
return Windows1250;
|
||||||
case Utf8::Windows1250:
|
return Utf8;
|
||||||
default:
|
|
||||||
lf.length = 1;
|
|
||||||
lf.lineFeed = new char[1]{ 0x0A };
|
|
||||||
}
|
|
||||||
return lf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LineFeed initLineFeed( const Encoding e )
|
||||||
|
{
|
||||||
|
LineFeed lf{};
|
||||||
|
switch ( e ) {
|
||||||
|
case Utf8::Utf32LE:
|
||||||
|
lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
|
||||||
|
lf.length = 4;
|
||||||
|
break;
|
||||||
|
case Utf8::Utf32BE:
|
||||||
|
lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
|
||||||
|
lf.length = 4;
|
||||||
|
break;
|
||||||
|
case Utf8::Utf16LE:
|
||||||
|
lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
|
||||||
|
lf.length = 2;
|
||||||
|
break;
|
||||||
|
case Utf8::Utf16BE:
|
||||||
|
lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
|
||||||
|
lf.length = 2;
|
||||||
|
break;
|
||||||
|
case Utf8::Windows1252:
|
||||||
|
|
||||||
|
case Utf8::Windows1251:
|
||||||
|
|
||||||
|
case Utf8::Utf8:
|
||||||
|
|
||||||
|
case Utf8::Windows1250:
|
||||||
|
default:
|
||||||
|
lf.length = 1;
|
||||||
|
lf.lineFeed = new char[ 1 ]{ 0x0A };
|
||||||
|
}
|
||||||
|
return lf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // namespace Utf8
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
#define __UTF8_HH_INCLUDED__
|
#define __UTF8_HH_INCLUDED__
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <QByteArray>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "ex.hh"
|
#include "ex.hh"
|
||||||
#include "wstring.hh"
|
#include "wstring.hh"
|
||||||
|
@ -15,14 +16,15 @@
|
||||||
namespace Utf8 {
|
namespace Utf8 {
|
||||||
|
|
||||||
// Those are possible encodings for .dsl files
|
// Those are possible encodings for .dsl files
|
||||||
enum Encoding
|
enum Encoding {
|
||||||
{
|
|
||||||
Utf16LE,
|
Utf16LE,
|
||||||
Utf16BE,
|
Utf16BE,
|
||||||
Windows1252,
|
Windows1252,
|
||||||
Windows1251,
|
Windows1251,
|
||||||
Windows1250,
|
Windows1250,
|
||||||
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
|
Utf8, // This is an extension. Detected solely by the UTF8 BOM.
|
||||||
|
Utf32BE,
|
||||||
|
Utf32LE,
|
||||||
};
|
};
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
|
@ -54,7 +56,8 @@ bool isspace( int c );
|
||||||
|
|
||||||
//get the first line in string s1. -1 if not found
|
//get the first line in string s1. -1 if not found
|
||||||
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
|
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
|
||||||
char const* getEncodingNameFor(Encoding e);
|
char const * getEncodingNameFor( Encoding e );
|
||||||
|
Encoding getEncodingForName( const QByteArray & name );
|
||||||
|
|
||||||
struct LineFeed
|
struct LineFeed
|
||||||
{
|
{
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#include "ufile.hh"
|
#include "ufile.hh"
|
||||||
#include "utf8.hh"
|
#include "utf8.hh"
|
||||||
|
|
||||||
|
#include <exception>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
|
|
||||||
|
@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
|
||||||
|
|
||||||
wstring result;
|
wstring result;
|
||||||
|
|
||||||
for( list< Node >::const_iterator i = begin(); i != end(); ++i )
|
for ( const auto & i : *this )
|
||||||
if( !stripTrsTag || i->tagName != U"!trs" )
|
if ( !stripTrsTag || i.tagName != U"!trs" )
|
||||||
result += i->renderAsText( stripTrsTag );
|
result += i.renderAsText( stripTrsTag );
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
|
||||||
processUnsortedParts( linkTo, true );
|
processUnsortedParts( linkTo, true );
|
||||||
expandOptionalParts( linkTo, &allLinkEntries );
|
expandOptionalParts( linkTo, &allLinkEntries );
|
||||||
|
|
||||||
for( list< wstring >::iterator entry = allLinkEntries.begin();
|
for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
|
||||||
entry != allLinkEntries.end(); )
|
|
||||||
{
|
|
||||||
if ( !textNode )
|
if ( !textNode )
|
||||||
{
|
{
|
||||||
Node text = Node( Node::Text(), wstring() );
|
Node text = Node( Node::Text(), wstring() );
|
||||||
|
@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
|
||||||
ArticleDom nodeDom( linkText, dictName, headword_ );
|
ArticleDom nodeDom( linkText, dictName, headword_ );
|
||||||
|
|
||||||
Node link( Node::Tag(), U"@" , wstring() );
|
Node link( Node::Tag(), U"@" , wstring() );
|
||||||
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
|
for ( auto & n : nodeDom.root )
|
||||||
link.push_back( *n );
|
link.push_back( n );
|
||||||
|
|
||||||
++entry;
|
++entry;
|
||||||
|
|
||||||
|
@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
|
||||||
nextChar();
|
nextChar();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch( eot )
|
catch ( std::exception & ex ) {
|
||||||
{
|
if ( !dictionaryName.empty() )
|
||||||
if( !dictionaryName.empty() )
|
|
||||||
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
|
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
|
||||||
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(),
|
QString::fromStdU32String( name ).toUtf8().data(),
|
||||||
dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() );
|
QString::fromStdU32String( attrs ).toUtf8().data(),
|
||||||
|
dictionaryName.c_str(),
|
||||||
|
QString::fromStdU32String( headword ).toUtf8().data() );
|
||||||
else
|
else
|
||||||
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
|
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
|
||||||
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() );
|
QString::fromStdU32String( name ).toUtf8().data(),
|
||||||
|
QString::fromStdU32String( attrs ).toUtf8().data() );
|
||||||
|
|
||||||
throw eot();
|
throw ex;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the tag, or close it
|
// Add the tag, or close it
|
||||||
|
@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
|
||||||
ArticleDom nodeDom( linkText, dictName, headword_ );
|
ArticleDom nodeDom( linkText, dictName, headword_ );
|
||||||
|
|
||||||
Node link( Node::Tag(), U"ref" , wstring() );
|
Node link( Node::Tag(), U"ref" , wstring() );
|
||||||
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
|
for ( auto & n : nodeDom.root )
|
||||||
link.push_back( *n );
|
link.push_back( n );
|
||||||
|
|
||||||
if ( stack.empty() )
|
if ( stack.empty() )
|
||||||
root.push_back( link );
|
root.push_back( link );
|
||||||
|
@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
|
||||||
textNode->text.push_back( ch );
|
textNode->text.push_back( ch );
|
||||||
} // for( ; ; )
|
} // for( ; ; )
|
||||||
}
|
}
|
||||||
catch( eot )
|
catch ( eot & ) {
|
||||||
{
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( textNode )
|
if ( textNode )
|
||||||
stack.pop_back();
|
stack.pop_back();
|
||||||
|
|
||||||
if ( stack.size() )
|
if ( !stack.empty() ) {
|
||||||
{
|
auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
|
||||||
list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
|
|
||||||
if( it == stack.end() )
|
if( it == stack.end() )
|
||||||
return; // no unclosed tags that must be closed => nothing to warn about
|
return; // no unclosed tags that must be closed => nothing to warn about
|
||||||
QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
|
QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
|
||||||
|
@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
|
||||||
// All tags above [m] tag will be closed and reopened after
|
// All tags above [m] tag will be closed and reopened after
|
||||||
// to avoid break this tag by closing some other tag.
|
// to avoid break this tag by closing some other tag.
|
||||||
|
|
||||||
while( stack.size() )
|
while ( !stack.empty() ) {
|
||||||
{
|
nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
|
||||||
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
|
|
||||||
stack.back()->tagAttrs ) );
|
|
||||||
|
|
||||||
if ( stack.back()->empty() )
|
if ( stack.back()->empty() )
|
||||||
{
|
{
|
||||||
|
@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,
|
||||||
|
|
||||||
stack.pop_back();
|
stack.pop_back();
|
||||||
|
|
||||||
Node * parent = stack.size() ? stack.back() : &root;
|
Node * parent = !stack.empty() ? stack.back() : &root;
|
||||||
|
|
||||||
parent->pop_back();
|
parent->pop_back();
|
||||||
}
|
}
|
||||||
|
@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,
|
||||||
|
|
||||||
// Reopen tags if needed
|
// Reopen tags if needed
|
||||||
|
|
||||||
while( nodesToReopen.size() )
|
while ( !nodesToReopen.empty() ) {
|
||||||
{
|
|
||||||
if ( stack.empty() )
|
if ( stack.empty() )
|
||||||
{
|
{
|
||||||
root.push_back( nodesToReopen.back() );
|
root.push_back( nodesToReopen.back() );
|
||||||
|
@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,
|
||||||
|
|
||||||
nodesToReopen.pop_back();
|
nodesToReopen.pop_back();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArticleDom::closeTag( wstring const & name,
|
void ArticleDom::closeTag( wstring const & name,
|
||||||
|
@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,
|
||||||
|
|
||||||
list< Node > nodesToReopen;
|
list< Node > nodesToReopen;
|
||||||
|
|
||||||
while( stack.size() )
|
while ( !stack.empty() ) {
|
||||||
{
|
|
||||||
bool found = stack.back()->tagName == name ||
|
bool found = stack.back()->tagName == name ||
|
||||||
checkM( stack.back()->tagName, name );
|
checkM( stack.back()->tagName, name );
|
||||||
|
|
||||||
if ( !found )
|
if ( !found )
|
||||||
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
|
nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
|
||||||
stack.back()->tagAttrs ) );
|
|
||||||
|
|
||||||
if( stack.back()->empty() && stack.back()->tagName != U"br" )
|
if( stack.back()->empty() && stack.back()->tagName != U"br" )
|
||||||
{
|
{
|
||||||
|
@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,
|
||||||
|
|
||||||
stack.pop_back();
|
stack.pop_back();
|
||||||
|
|
||||||
Node * parent = stack.size() ? stack.back() : &root;
|
Node * parent = !stack.empty() ? stack.back() : &root;
|
||||||
|
|
||||||
parent->pop_back();
|
parent->pop_back();
|
||||||
}
|
}
|
||||||
|
@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
while( nodesToReopen.size() )
|
while ( !nodesToReopen.empty() ) {
|
||||||
{
|
|
||||||
if ( stack.empty() )
|
if ( stack.empty() )
|
||||||
{
|
{
|
||||||
root.push_back( nodesToReopen.back() );
|
root.push_back( nodesToReopen.back() );
|
||||||
|
@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :
|
||||||
|
|
||||||
// Now try guessing the encoding by reading the first two bytes
|
// Now try guessing the encoding by reading the first two bytes
|
||||||
|
|
||||||
unsigned char firstBytes[ 2 ];
|
unsigned char firstBytes[ 50 ];
|
||||||
|
|
||||||
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
|
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) {
|
||||||
{
|
|
||||||
// Apparently the file's too short
|
// Apparently the file's too short
|
||||||
gzclose( f );
|
gzclose( f );
|
||||||
throw exMalformedDslFile( fileName );
|
throw exMalformedDslFile( fileName );
|
||||||
|
@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :
|
||||||
|
|
||||||
bool needExactEncoding = false;
|
bool needExactEncoding = false;
|
||||||
|
|
||||||
|
QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 );
|
||||||
|
codec = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) );
|
||||||
|
|
||||||
// If the file begins with the dedicated Unicode marker, we just consume
|
encoding = Utf8::getEncodingForName( codec->name() );
|
||||||
// it. If, on the other hand, it's not, we return the bytes back
|
qDebug() << codec->name();
|
||||||
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
|
||||||
encoding = Utf8::Utf16LE;
|
|
||||||
else
|
|
||||||
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
|
||||||
encoding = Utf8::Utf16BE;
|
|
||||||
else
|
|
||||||
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
|
||||||
{
|
|
||||||
// Looks like Utf8, read one more byte
|
|
||||||
if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
|
|
||||||
{
|
|
||||||
// Either the file's too short, or the BOM is weird
|
|
||||||
gzclose( f );
|
|
||||||
throw exMalformedDslFile( fileName );
|
|
||||||
}
|
|
||||||
|
|
||||||
encoding = Utf8::Utf8;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
|
|
||||||
encoding = Utf8::Utf16LE;
|
|
||||||
else
|
|
||||||
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
|
|
||||||
encoding = Utf8::Utf16BE;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// Ok, this doesn't look like 16-bit Unicode. We will start with a
|
|
||||||
// 8-bit encoding with an intent to find out the exact one from
|
|
||||||
// the header.
|
|
||||||
needExactEncoding = true;
|
|
||||||
encoding = Utf8::Windows1251;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( gzrewind( f ) )
|
if ( gzrewind( f ) ) {
|
||||||
{
|
gzclose( f );
|
||||||
gzclose( f );
|
throw exCantOpen( fileName );
|
||||||
throw exCantOpen( fileName );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//iconv.reinit( encoding );
|
//iconv.reinit( encoding );
|
||||||
codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
|
lineFeed = Utf8::initLineFeed( encoding );
|
||||||
lineFeed=Utf8::initLineFeed(encoding);
|
|
||||||
// We now can use our own readNextLine() function
|
// We now can use our own readNextLine() function
|
||||||
|
|
||||||
wstring str;
|
wstring str;
|
||||||
|
|
Loading…
Reference in a new issue