fix: support dsl dictionary utf encoding detection (#830)

* fix: support dsl dictionary utf encoding detection

* fix: code smells

* fix: code smells

* 🎨 apply clang-format changes

* fix: adjust linefeed

---------

Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-06-09 08:01:45 +08:00 committed by GitHub
parent 5fd9261047
commit 67ed24c61c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 144 additions and 165 deletions

View file

@ -4,32 +4,28 @@
#include "utf8.hh" #include "utf8.hh"
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>
#include <QByteArray>
#include <QString>
namespace Utf8 { namespace Utf8 {
size_t encode( wchar const * in, size_t inSize, char * out_ ) size_t encode( wchar const * in, size_t inSize, char * out_ )
{ {
unsigned char * out = (unsigned char *) out_; unsigned char * out = (unsigned char *)out_;
while( inSize-- ) while ( inSize-- ) {
{
if ( *in < 0x80 ) if ( *in < 0x80 )
*out++ = *in++; *out++ = *in++;
else else if ( *in < 0x800 ) {
if ( *in < 0x800 )
{
*out++ = 0xC0 | ( *in >> 6 ); *out++ = 0xC0 | ( *in >> 6 );
*out++ = 0x80 | ( *in++ & 0x3F ); *out++ = 0x80 | ( *in++ & 0x3F );
} }
else else if ( *in < 0x10000 ) {
if ( *in < 0x10000 )
{
*out++ = 0xE0 | ( *in >> 12 ); *out++ = 0xE0 | ( *in >> 12 );
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F ); *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
*out++ = 0x80 | ( *in++ & 0x3F ); *out++ = 0x80 | ( *in++ & 0x3F );
} }
else else {
{
*out++ = 0xF0 | ( *in >> 18 ); *out++ = 0xF0 | ( *in >> 18 );
*out++ = 0x80 | ( ( *in >> 12 ) & 0x3F ); *out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F ); *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
} }
} }
return out - (unsigned char *) out_; return out - (unsigned char *)out_;
} }
long decode( char const * in_, size_t inSize, wchar * out_ ) long decode( char const * in_, size_t inSize, wchar * out_ )
{ {
unsigned char const * in = (unsigned char const *) in_; unsigned char const * in = (unsigned char const *)in_;
wchar * out = out_; wchar * out = out_;
while( inSize-- ) while ( inSize-- ) {
{
wchar result; wchar result;
if ( *in & 0x80 ) if ( *in & 0x80 ) {
{ if ( *in & 0x40 ) {
if ( *in & 0x40 ) if ( *in & 0x20 ) {
{ if ( *in & 0x10 ) {
if ( *in & 0x20 )
{
if ( *in & 0x10 )
{
// Four-byte sequence // Four-byte sequence
if ( *in & 8 ) if ( *in & 8 )
// This can't be // This can't be
@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 3; inSize -= 3;
result = ( (wchar )*in++ & 7 ) << 18; result = ( (wchar)*in++ & 7 ) << 18;
if ( ( *in & 0xC0 ) != 0x80 ) if ( ( *in & 0xC0 ) != 0x80 )
return -1; return -1;
@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
return -1; return -1;
result |= (wchar)*in++ & 0x3F; result |= (wchar)*in++ & 0x3F;
} }
else else {
{
// Three-byte sequence // Three-byte sequence
if ( inSize < 2 ) if ( inSize < 2 )
@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 2; inSize -= 2;
result = ( (wchar )*in++ & 0xF ) << 12; result = ( (wchar)*in++ & 0xF ) << 12;
if ( ( *in & 0xC0 ) != 0x80 ) if ( ( *in & 0xC0 ) != 0x80 )
return -1; return -1;
@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
result |= (wchar)*in++ & 0x3F; result |= (wchar)*in++ & 0x3F;
} }
} }
else else {
{
// Two-byte sequence // Two-byte sequence
if ( !inSize ) if ( !inSize )
return -1; return -1;
--inSize; --inSize;
result = ( (wchar )*in++ & 0x1F ) << 6; result = ( (wchar)*in++ & 0x1F ) << 6;
if ( ( *in & 0xC0 ) != 0x80 ) if ( ( *in & 0xC0 ) != 0x80 )
return -1; return -1;
result |= (wchar)*in++ & 0x3F; result |= (wchar)*in++ & 0x3F;
} }
} }
else else {
{
// This char is from the middle of encoding, it can't be leading // This char is from the middle of encoding, it can't be leading
return -1; return -1;
} }
@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept
std::vector< char > buffer( in.size() * 4 ); std::vector< char > buffer( in.size() * 4 );
return string( &buffer.front(), return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
encode( in.data(), in.size(), &buffer.front() ) );
} }
wstring decode( string const & in ) wstring decode( string const & in )
{ {
if ( in.empty() ) if ( in.empty() )
return {}; return {};
std::vector< wchar > buffer( in.size() ); std::vector< wchar > buffer( in.size() );
long result = decode( in.data(), in.size(), &buffer.front() ); long result = decode( in.data(), in.size(), &buffer.front() );
if ( result < 0 ) if ( result < 0 )
throw exCantDecode( in ); throw exCantDecode( in );
@ -160,8 +147,7 @@ wstring decode( string const & in )
bool isspace( int c ) bool isspace( int c )
{ {
switch( c ) switch ( c ) {
{
case ' ': case ' ':
case '\f': case '\f':
case '\n': case '\n':
@ -176,62 +162,95 @@ bool isspace( int c )
} }
// Returns the size in bytes of the first line stored in s1, including its
// terminator s2 (the encoded line feed, s2length bytes long).
//
// s1, s1length - buffer to scan and the number of valid bytes in it
// s2, s2length - byte sequence that terminates a line and its length
//
// If s2 does not occur within the first s1length bytes, s1length is returned,
// i.e. the whole buffer is treated as one unterminated line. Note that -1 is
// never returned.
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
{
  char * const end = s1 + s1length;
  char * const pos = std::search( s1, end, s2, s2 + s2length );

  if ( pos == end )
    return static_cast< int >( pos - s1 ); // no terminator found: equals s1length

  // Offset of the terminator plus the terminator itself is the line size.
  return static_cast< int >( pos - s1 ) + s2length;
}
// Maps an Encoding value to its textual encoding name.
//
// The returned pointer refers to a string literal, so it must not be freed.
// Any value without a case of its own yields "UTF-8".
char const * getEncodingNameFor( Encoding e )
{
  switch ( e ) {
    case Utf32LE:
      return "UTF-32LE";
    case Utf32BE:
      return "UTF-32BE";
    case Utf16LE:
      return "UTF-16LE";
    case Utf16BE:
      return "UTF-16BE";
    case Windows1252:
      return "WINDOWS-1252";
    case Windows1251:
      return "WINDOWS-1251";
    case Windows1250:
      return "WINDOWS-1250";
    case Utf8:
    default:
      return "UTF-8";
  }
}
LineFeed initLineFeed(Encoding e) Encoding getEncodingForName( const QByteArray & _name )
{ {
LineFeed lf; const auto name = _name.toUpper();
switch (e) if ( name == "UTF-32LE" )
{ return Utf32LE;
case Utf8::Utf16LE: if ( name == "UTF-32BE" )
lf.lineFeed= new char[2]{ 0x0A,0 }; return Utf32BE;
lf.length = 2; if ( name == "UTF-16LE" )
break; return Utf16LE;
case Utf8::Utf16BE: if ( name == "UTF-16BE" )
lf.lineFeed = new char[2]{ 0,0x0A }; return Utf16BE;
lf.length = 2; if ( name == "WINDOWS-1252" )
break; return Windows1252;
case Utf8::Windows1252: if ( name == "WINDOWS-1251" )
return Windows1251;
case Utf8::Windows1251: if ( name == "UTF-8" )
return Utf8;
case Utf8::Utf8: if ( name == "WINDOWS-1250" )
return Windows1250;
case Utf8::Windows1250: return Utf8;
default:
lf.length = 1;
lf.lineFeed = new char[1]{ 0x0A };
}
return lf;
} }
// Builds the byte sequence that represents a line feed (0x0A) in encoding e
// and returns it together with its length in bytes:
//   UTF-32: 4 bytes, UTF-16: 2 bytes, everything else: 1 byte,
// with the 0x0A byte placed first for the LE variants and last for the BE ones.
//
// The buffer is allocated with new[] and is not released by this function.
// NOTE(review): confirm that whoever holds the LineFeed calls delete[] on it.
LineFeed initLineFeed( const Encoding e )
{
  LineFeed lf{};
  switch ( e ) {
    case Utf8::Utf32LE:
      lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
      lf.length   = 4;
      break;
    case Utf8::Utf32BE:
      lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
      lf.length   = 4;
      break;
    case Utf8::Utf16LE:
      lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
      lf.length   = 2;
      break;
    case Utf8::Utf16BE:
      lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
      lf.length   = 2;
      break;
    // All remaining encodings use a single-byte line feed.
    case Utf8::Windows1252:
    case Utf8::Windows1251:
    case Utf8::Utf8:
    case Utf8::Windows1250:
    default:
      lf.length   = 1;
      lf.lineFeed = new char[ 1 ]{ 0x0A };
      break;
  }
  return lf;
}
} // namespace Utf8

View file

@ -4,6 +4,7 @@
#define __UTF8_HH_INCLUDED__ #define __UTF8_HH_INCLUDED__
#include <cstdio> #include <cstdio>
#include <QByteArray>
#include <string> #include <string>
#include "ex.hh" #include "ex.hh"
#include "wstring.hh" #include "wstring.hh"
@ -15,14 +16,15 @@
namespace Utf8 { namespace Utf8 {
// Those are possible encodings for .dsl files
enum Encoding {
  Utf16LE,
  Utf16BE,
  Windows1252,
  Windows1251,
  Windows1250,
  // This is an extension. Recognised by the UTF-8 BOM.
  // NOTE(review): the scanner now passes UTF-8 as the default codec to the
  // BOM detection, so this also appears to be the fallback - confirm.
  Utf8,
  Utf32BE,
  Utf32LE,
};
using std::string; using std::string;
@ -54,7 +56,8 @@ bool isspace( int c );
// Returns the size in bytes of the first line in s1, terminator s2 included; if s2 is not found, returns s1length (never -1).
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length); int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
char const* getEncodingNameFor(Encoding e); char const * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );
struct LineFeed struct LineFeed
{ {

View file

@ -9,6 +9,7 @@
#include "ufile.hh" #include "ufile.hh"
#include "utf8.hh" #include "utf8.hh"
#include <exception>
#include <stdio.h> #include <stdio.h>
#include <wctype.h> #include <wctype.h>
@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
wstring result; wstring result;
for( list< Node >::const_iterator i = begin(); i != end(); ++i ) for ( const auto & i : *this )
if( !stripTrsTag || i->tagName != U"!trs" ) if ( !stripTrsTag || i.tagName != U"!trs" )
result += i->renderAsText( stripTrsTag ); result += i.renderAsText( stripTrsTag );
return result; return result;
} }
@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
processUnsortedParts( linkTo, true ); processUnsortedParts( linkTo, true );
expandOptionalParts( linkTo, &allLinkEntries ); expandOptionalParts( linkTo, &allLinkEntries );
for( list< wstring >::iterator entry = allLinkEntries.begin(); for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
entry != allLinkEntries.end(); )
{
if ( !textNode ) if ( !textNode )
{ {
Node text = Node( Node::Text(), wstring() ); Node text = Node( Node::Text(), wstring() );
@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
ArticleDom nodeDom( linkText, dictName, headword_ ); ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"@" , wstring() ); Node link( Node::Tag(), U"@" , wstring() );
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n ) for ( auto & n : nodeDom.root )
link.push_back( *n ); link.push_back( n );
++entry; ++entry;
@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
nextChar(); nextChar();
} }
} }
catch( eot ) catch ( std::exception & ex ) {
{ if ( !dictionaryName.empty() )
if( !dictionaryName.empty() )
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)", gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(), QString::fromStdU32String( name ).toUtf8().data(),
dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() ); QString::fromStdU32String( attrs ).toUtf8().data(),
dictionaryName.c_str(),
QString::fromStdU32String( headword ).toUtf8().data() );
else else
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)", gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() ); QString::fromStdU32String( name ).toUtf8().data(),
QString::fromStdU32String( attrs ).toUtf8().data() );
throw eot(); throw ex;
} }
// Add the tag, or close it // Add the tag, or close it
@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
ArticleDom nodeDom( linkText, dictName, headword_ ); ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"ref" , wstring() ); Node link( Node::Tag(), U"ref" , wstring() );
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n ) for ( auto & n : nodeDom.root )
link.push_back( *n ); link.push_back( n );
if ( stack.empty() ) if ( stack.empty() )
root.push_back( link ); root.push_back( link );
@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
textNode->text.push_back( ch ); textNode->text.push_back( ch );
} // for( ; ; ) } // for( ; ; )
} }
catch( eot ) catch ( eot & ) {
{
} }
if ( textNode ) if ( textNode )
stack.pop_back(); stack.pop_back();
if ( stack.size() ) if ( !stack.empty() ) {
{ auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
if( it == stack.end() ) if( it == stack.end() )
return; // no unclosed tags that must be closed => nothing to warn about return; // no unclosed tags that must be closed => nothing to warn about
QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8(); QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
// All tags above [m] tag will be closed and reopened after // All tags above [m] tag will be closed and reopened after
// to avoid break this tag by closing some other tag. // to avoid break this tag by closing some other tag.
while( stack.size() ) while ( !stack.empty() ) {
{ nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
stack.back()->tagAttrs ) );
if ( stack.back()->empty() ) if ( stack.back()->empty() )
{ {
@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,
stack.pop_back(); stack.pop_back();
Node * parent = stack.size() ? stack.back() : &root; Node * parent = !stack.empty() ? stack.back() : &root;
parent->pop_back(); parent->pop_back();
} }
@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,
// Reopen tags if needed // Reopen tags if needed
while( nodesToReopen.size() ) while ( !nodesToReopen.empty() ) {
{
if ( stack.empty() ) if ( stack.empty() )
{ {
root.push_back( nodesToReopen.back() ); root.push_back( nodesToReopen.back() );
@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,
nodesToReopen.pop_back(); nodesToReopen.pop_back();
} }
} }
void ArticleDom::closeTag( wstring const & name, void ArticleDom::closeTag( wstring const & name,
@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,
list< Node > nodesToReopen; list< Node > nodesToReopen;
while( stack.size() ) while ( !stack.empty() ) {
{
bool found = stack.back()->tagName == name || bool found = stack.back()->tagName == name ||
checkM( stack.back()->tagName, name ); checkM( stack.back()->tagName, name );
if ( !found ) if ( !found )
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName, nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
stack.back()->tagAttrs ) );
if( stack.back()->empty() && stack.back()->tagName != U"br" ) if( stack.back()->empty() && stack.back()->tagName != U"br" )
{ {
@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,
stack.pop_back(); stack.pop_back();
Node * parent = stack.size() ? stack.back() : &root; Node * parent = !stack.empty() ? stack.back() : &root;
parent->pop_back(); parent->pop_back();
} }
@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
break; break;
} }
while( nodesToReopen.size() ) while ( !nodesToReopen.empty() ) {
{
if ( stack.empty() ) if ( stack.empty() )
{ {
root.push_back( nodesToReopen.back() ); root.push_back( nodesToReopen.back() );
@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :
// Now try guessing the encoding by reading the first two bytes // Now try guessing the encoding by reading the first two bytes
unsigned char firstBytes[ 2 ]; unsigned char firstBytes[ 50 ];
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) {
{
// Apparently the file's too short // Apparently the file's too short
gzclose( f ); gzclose( f );
throw exMalformedDslFile( fileName ); throw exMalformedDslFile( fileName );
@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :
bool needExactEncoding = false; bool needExactEncoding = false;
QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 );
codec = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) );
// If the file begins with the dedicated Unicode marker, we just consume encoding = Utf8::getEncodingForName( codec->name() );
// it. If, on the other hand, it's not, we return the bytes back qDebug() << codec->name();
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
encoding = Utf8::Utf16LE;
else
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
encoding = Utf8::Utf16BE;
else
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
{
// Looks like Utf8, read one more byte
if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
{
// Either the file's too short, or the BOM is weird
gzclose( f );
throw exMalformedDslFile( fileName );
}
encoding = Utf8::Utf8;
}
else
{
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
encoding = Utf8::Utf16LE;
else
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
encoding = Utf8::Utf16BE;
else
{
// Ok, this doesn't look like 16-bit Unicode. We will start with a
// 8-bit encoding with an intent to find out the exact one from
// the header.
needExactEncoding = true;
encoding = Utf8::Windows1251;
}
if ( gzrewind( f ) ) if ( gzrewind( f ) ) {
{ gzclose( f );
gzclose( f ); throw exCantOpen( fileName );
throw exCantOpen( fileName );
}
} }
//iconv.reinit( encoding ); //iconv.reinit( encoding );
codec = QTextCodec::codecForName(getEncodingNameFor(encoding)); lineFeed = Utf8::initLineFeed( encoding );
lineFeed=Utf8::initLineFeed(encoding);
// We now can use our own readNextLine() function // We now can use our own readNextLine() function
wstring str; wstring str;