fix: support dsl dictionary utf encoding detection (#830)

* fix: support dsl dictionary utf encoding detection

* fix: code smells

* fix: code smells

* 🎨 apply clang-format changes

* fix: adjust linefeed

---------

Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
This commit is contained in:
xiaoyifang 2023-06-09 08:01:45 +08:00 committed by GitHub
parent 5fd9261047
commit 67ed24c61c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 144 additions and 165 deletions

View file

@ -4,32 +4,28 @@
#include "utf8.hh"
#include <vector>
#include <algorithm>
#include <QByteArray>
#include <QString>
namespace Utf8 {
size_t encode( wchar const * in, size_t inSize, char * out_ )
{
unsigned char * out = (unsigned char *) out_;
unsigned char * out = (unsigned char *)out_;
while( inSize-- )
{
while ( inSize-- ) {
if ( *in < 0x80 )
*out++ = *in++;
else
if ( *in < 0x800 )
{
else if ( *in < 0x800 ) {
*out++ = 0xC0 | ( *in >> 6 );
*out++ = 0x80 | ( *in++ & 0x3F );
}
else
if ( *in < 0x10000 )
{
else if ( *in < 0x10000 ) {
*out++ = 0xE0 | ( *in >> 12 );
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
*out++ = 0x80 | ( *in++ & 0x3F );
}
else
{
else {
*out++ = 0xF0 | ( *in >> 18 );
*out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
*out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
}
}
return out - (unsigned char *) out_;
return out - (unsigned char *)out_;
}
long decode( char const * in_, size_t inSize, wchar * out_ )
{
unsigned char const * in = (unsigned char const *) in_;
wchar * out = out_;
unsigned char const * in = (unsigned char const *)in_;
wchar * out = out_;
while( inSize-- )
{
while ( inSize-- ) {
wchar result;
if ( *in & 0x80 )
{
if ( *in & 0x40 )
{
if ( *in & 0x20 )
{
if ( *in & 0x10 )
{
if ( *in & 0x80 ) {
if ( *in & 0x40 ) {
if ( *in & 0x20 ) {
if ( *in & 0x10 ) {
// Four-byte sequence
if ( *in & 8 )
// This can't be
@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 3;
result = ( (wchar )*in++ & 7 ) << 18;
result = ( (wchar)*in++ & 7 ) << 18;
if ( ( *in & 0xC0 ) != 0x80 )
return -1;
@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
return -1;
result |= (wchar)*in++ & 0x3F;
}
else
{
else {
// Three-byte sequence
if ( inSize < 2 )
@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
inSize -= 2;
result = ( (wchar )*in++ & 0xF ) << 12;
result = ( (wchar)*in++ & 0xF ) << 12;
if ( ( *in & 0xC0 ) != 0x80 )
return -1;
@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
result |= (wchar)*in++ & 0x3F;
}
}
else
{
else {
// Two-byte sequence
if ( !inSize )
return -1;
--inSize;
result = ( (wchar )*in++ & 0x1F ) << 6;
result = ( (wchar)*in++ & 0x1F ) << 6;
if ( ( *in & 0xC0 ) != 0x80 )
return -1;
result |= (wchar)*in++ & 0x3F;
}
}
else
{
else {
// This char is from the middle of encoding, it can't be leading
return -1;
}
@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept
std::vector< char > buffer( in.size() * 4 );
return string( &buffer.front(),
encode( in.data(), in.size(), &buffer.front() ) );
return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
}
wstring decode( string const & in )
wstring decode( string const & in )
{
if ( in.empty() )
return {};
std::vector< wchar > buffer( in.size() );
long result = decode( in.data(), in.size(), &buffer.front() );
long result = decode( in.data(), in.size(), &buffer.front() );
if ( result < 0 )
throw exCantDecode( in );
@ -160,8 +147,7 @@ wstring decode( string const & in )
bool isspace( int c )
{
switch( c )
{
switch ( c ) {
case ' ':
case '\f':
case '\n':
@ -176,62 +162,95 @@ bool isspace( int c )
}
// Finds the first occurrence of the byte sequence s2 (s2length bytes) in the
// buffer s1 (s1length bytes) with std::search and returns the offset just past
// it (pos - s1 + s2length) -- i.e. the size of the first line including its
// terminator. When s2 does not occur, std::search yields s1 + s1length, so
// the result is s1length; this function never returns -1.
// NOTE(review): this is a rendered diff without +/- markers; each statement
// below appears twice, presumably as its pre- and post-clang-format spelling.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
{
char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length );
// Not found: pos equals the end of s1, so this returns s1length.
if (pos == s1 + s1length)
return pos-s1;
if ( pos == s1 + s1length )
return pos - s1;
//the line size.
return pos- s1+ s2length;
//the line size.
return pos - s1 + s2length;
}
// Maps an Encoding enumerator to a string name (e.g. Utf16LE -> "UTF-16LE").
// The returned pointers refer to string literals.
// NOTE(review): this is a rendered diff without +/- markers, so removed and
// added lines are interleaved; duplicated return statements are the same line
// in its old and new spelling. The Utf32LE / Utf32BE cases have no duplicate
// and look like additions.
char const* getEncodingNameFor(Encoding e)
char const * getEncodingNameFor( Encoding e )
{
switch (e)
{
switch ( e ) {
case Utf32LE:
return "UTF-32LE";
case Utf32BE:
return "UTF-32BE";
case Utf16LE:
return "UTF-16LE";
return "UTF-16LE";
case Utf16BE:
return "UTF-16BE";
return "UTF-16BE";
case Windows1252:
return "WINDOWS-1252";
return "WINDOWS-1252";
case Windows1251:
return "WINDOWS-1251";
return "WINDOWS-1251";
case Utf8:
return "UTF-8";
return "UTF-8";
case Windows1250:
return "WINDOWS-1250";
// NOTE(review): two different fallback values follow ("WINDOWS-1250" and
// "UTF-8"); presumably the former is the removed default and the latter the
// added one -- confirm against the real diff.
default:
return "WINDOWS-1250";
}
return "UTF-8";
}
}
// NOTE(review): rendered diff without +/- markers. Two signatures share the
// opening brace below. Judging by the return types, the switch that fills in
// lf.lineFeed / lf.length and returns lf belongs to initLineFeed, while the
// chain of name comparisons that returns Encoding values belongs to
// getEncodingForName.
//
// initLineFeed (as spelled here): builds a LineFeed whose lineFeed array holds
// the byte 0x0A padded to 2 bytes for Utf16LE (0x0A first) and Utf16BE (0x0A
// last), or the single byte 0x0A for every other encoding. The array is
// allocated with new[]; nothing in this block releases it.
//
// getEncodingForName: upper-cases the given name and compares it with the
// known encoding names; returns the matching Encoding, or Utf8 when none match.
LineFeed initLineFeed(Encoding e)
Encoding getEncodingForName( const QByteArray & _name )
{
LineFeed lf;
switch (e)
{
case Utf8::Utf16LE:
lf.lineFeed= new char[2]{ 0x0A,0 };
lf.length = 2;
break;
case Utf8::Utf16BE:
lf.lineFeed = new char[2]{ 0,0x0A };
lf.length = 2;
break;
case Utf8::Windows1252:
case Utf8::Windows1251:
case Utf8::Utf8:
case Utf8::Windows1250:
default:
lf.length = 1;
lf.lineFeed = new char[1]{ 0x0A };
}
return lf;
// Comparison is done on the upper-cased name, so lookups are case-insensitive.
const auto name = _name.toUpper();
if ( name == "UTF-32LE" )
return Utf32LE;
if ( name == "UTF-32BE" )
return Utf32BE;
if ( name == "UTF-16LE" )
return Utf16LE;
if ( name == "UTF-16BE" )
return Utf16BE;
if ( name == "WINDOWS-1252" )
return Windows1252;
if ( name == "WINDOWS-1251" )
return Windows1251;
if ( name == "UTF-8" )
return Utf8;
if ( name == "WINDOWS-1250" )
return Windows1250;
// Unrecognised name: fall back to Utf8.
return Utf8;
}
// Builds the LineFeed descriptor for encoding e: the line-feed byte 0x0A laid
// out at the width of that encoding, together with the byte count.
//   Utf32LE / Utf32BE -> 4 bytes, 0x0A in the first / last position
//   Utf16LE / Utf16BE -> 2 bytes, 0x0A in the first / last position
//   anything else     -> the single byte 0x0A
// The byte array is allocated with new[]; this function does not release it.
LineFeed initLineFeed( const Encoding e )
{
  // Width of one code unit sequence and where the 0x0A byte sits inside it.
  int width    = 1;
  int position = 0;

  switch ( e ) {
    case Utf8::Utf32LE:
      width = 4;
      break;
    case Utf8::Utf32BE:
      width    = 4;
      position = 3;
      break;
    case Utf8::Utf16LE:
      width = 2;
      break;
    case Utf8::Utf16BE:
      width    = 2;
      position = 1;
      break;
    default:
      // All single-byte encodings (and UTF-8) keep the 1-byte defaults.
      break;
  }

  // Zero-filled buffer with the line-feed byte placed per endianness.
  char * bytes      = new char[ width ]{};
  bytes[ position ] = 0x0A;

  LineFeed lf{};
  lf.lineFeed = bytes;
  lf.length   = width;
  return lf;
}
} // namespace Utf8

View file

@ -4,6 +4,7 @@
#define __UTF8_HH_INCLUDED__
#include <cstdio>
#include <QByteArray>
#include <string>
#include "ex.hh"
#include "wstring.hh"
@ -15,14 +16,15 @@
namespace Utf8 {
// Those are possible encodings for .dsl files
// NOTE(review): rendered diff without +/- markers -- the enum header and the
// Utf8 enumerator each appear twice (old and new spelling). The second
// spelling of Utf8 carries a trailing comma and is followed by the two
// UTF-32 enumerators, which are placed after all pre-existing values.
enum Encoding
{
enum Encoding {
Utf16LE,
Utf16BE,
Windows1252,
Windows1251,
Windows1250,
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
Utf8, // This is an extension. Detected solely by the UTF8 BOM.
// UTF-32, big-endian and little-endian byte order respectively.
Utf32BE,
Utf32LE,
};
using std::string;
@ -54,7 +56,8 @@ bool isspace( int c );
//get the size of the first line in s1 (up to and including the first occurrence of s2); returns s1length if s2 is not found
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
char const* getEncodingNameFor(Encoding e);
char const * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );
struct LineFeed
{

View file

@ -9,6 +9,7 @@
#include "ufile.hh"
#include "utf8.hh"
#include <exception>
#include <stdio.h>
#include <wctype.h>
@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
wstring result;
for( list< Node >::const_iterator i = begin(); i != end(); ++i )
if( !stripTrsTag || i->tagName != U"!trs" )
result += i->renderAsText( stripTrsTag );
for ( const auto & i : *this )
if ( !stripTrsTag || i.tagName != U"!trs" )
result += i.renderAsText( stripTrsTag );
return result;
}
@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
processUnsortedParts( linkTo, true );
expandOptionalParts( linkTo, &allLinkEntries );
for( list< wstring >::iterator entry = allLinkEntries.begin();
entry != allLinkEntries.end(); )
{
for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
if ( !textNode )
{
Node text = Node( Node::Text(), wstring() );
@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"@" , wstring() );
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
link.push_back( *n );
for ( auto & n : nodeDom.root )
link.push_back( n );
++entry;
@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
nextChar();
}
}
catch( eot )
{
if( !dictionaryName.empty() )
catch ( std::exception & ex ) {
if ( !dictionaryName.empty() )
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(),
dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() );
QString::fromStdU32String( name ).toUtf8().data(),
QString::fromStdU32String( attrs ).toUtf8().data(),
dictionaryName.c_str(),
QString::fromStdU32String( headword ).toUtf8().data() );
else
gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() );
QString::fromStdU32String( name ).toUtf8().data(),
QString::fromStdU32String( attrs ).toUtf8().data() );
throw eot();
throw ex;
}
// Add the tag, or close it
@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
ArticleDom nodeDom( linkText, dictName, headword_ );
Node link( Node::Tag(), U"ref" , wstring() );
for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
link.push_back( *n );
for ( auto & n : nodeDom.root )
link.push_back( n );
if ( stack.empty() )
root.push_back( link );
@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
textNode->text.push_back( ch );
} // for( ; ; )
}
catch( eot )
{
catch ( eot & ) {
}
if ( textNode )
stack.pop_back();
if ( stack.size() )
{
list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
if ( !stack.empty() ) {
auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
if( it == stack.end() )
return; // no unclosed tags that must be closed => nothing to warn about
QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
// All tags above [m] tag will be closed and reopened after
// to avoid break this tag by closing some other tag.
while( stack.size() )
{
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
stack.back()->tagAttrs ) );
while ( !stack.empty() ) {
nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
if ( stack.back()->empty() )
{
@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,
stack.pop_back();
Node * parent = stack.size() ? stack.back() : &root;
Node * parent = !stack.empty() ? stack.back() : &root;
parent->pop_back();
}
@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,
// Reopen tags if needed
while( nodesToReopen.size() )
{
while ( !nodesToReopen.empty() ) {
if ( stack.empty() )
{
root.push_back( nodesToReopen.back() );
@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,
nodesToReopen.pop_back();
}
}
void ArticleDom::closeTag( wstring const & name,
@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,
list< Node > nodesToReopen;
while( stack.size() )
{
while ( !stack.empty() ) {
bool found = stack.back()->tagName == name ||
checkM( stack.back()->tagName, name );
if ( !found )
nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
stack.back()->tagAttrs ) );
nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
if( stack.back()->empty() && stack.back()->tagName != U"br" )
{
@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,
stack.pop_back();
Node * parent = stack.size() ? stack.back() : &root;
Node * parent = !stack.empty() ? stack.back() : &root;
parent->pop_back();
}
@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
break;
}
while( nodesToReopen.size() )
{
while ( !nodesToReopen.empty() ) {
if ( stack.empty() )
{
root.push_back( nodesToReopen.back() );
@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :
// Now try guessing the encoding by reading the first two bytes
unsigned char firstBytes[ 2 ];
unsigned char firstBytes[ 50 ];
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
{
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) {
// Apparently the file's too short
gzclose( f );
throw exMalformedDslFile( fileName );
@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :
bool needExactEncoding = false;
QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 );
codec = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) );
// If the file begins with the dedicated Unicode marker, we just consume
// it. If, on the other hand, it's not, we return the bytes back
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
encoding = Utf8::Utf16LE;
else
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
encoding = Utf8::Utf16BE;
else
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
{
// Looks like Utf8, read one more byte
if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
{
// Either the file's too short, or the BOM is weird
gzclose( f );
throw exMalformedDslFile( fileName );
}
encoding = Utf8::Utf8;
}
else
{
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
encoding = Utf8::Utf16LE;
else
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
encoding = Utf8::Utf16BE;
else
{
// Ok, this doesn't look like 16-bit Unicode. We will start with a
// 8-bit encoding with an intent to find out the exact one from
// the header.
needExactEncoding = true;
encoding = Utf8::Windows1251;
}
encoding = Utf8::getEncodingForName( codec->name() );
qDebug() << codec->name();
if ( gzrewind( f ) )
{
gzclose( f );
throw exCantOpen( fileName );
}
if ( gzrewind( f ) ) {
gzclose( f );
throw exCantOpen( fileName );
}
//iconv.reinit( encoding );
codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
lineFeed=Utf8::initLineFeed(encoding);
lineFeed = Utf8::initLineFeed( encoding );
// We now can use our own readNextLine() function
wstring str;