From 67ed24c61c29d4025843c9d892f0445c4c008bde Mon Sep 17 00:00:00 2001 From: xiaoyifang <105986+xiaoyifang@users.noreply.github.com> Date: Fri, 9 Jun 2023 08:01:45 +0800 Subject: [PATCH] fix: support dsl dictionary utf encoding detection (#830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: support dsl dictionary utf encoding detection * fix: code smells * fix: code smells * 🎨 apply clang-format changes * fix: adjust linefeed --------- Co-authored-by: xiaoyifang --- src/common/utf8.cc | 177 ++++++++++++++++++++++------------------ src/common/utf8.hh | 11 ++- src/dict/dsl_details.cc | 121 +++++++++------------------ 3 files changed, 144 insertions(+), 165 deletions(-) diff --git a/src/common/utf8.cc b/src/common/utf8.cc index ea641408..8cf9c4ab 100644 --- a/src/common/utf8.cc +++ b/src/common/utf8.cc @@ -4,32 +4,28 @@ #include "utf8.hh" #include #include +#include +#include namespace Utf8 { size_t encode( wchar const * in, size_t inSize, char * out_ ) { - unsigned char * out = (unsigned char *) out_; + unsigned char * out = (unsigned char *)out_; - while( inSize-- ) - { + while ( inSize-- ) { if ( *in < 0x80 ) *out++ = *in++; - else - if ( *in < 0x800 ) - { + else if ( *in < 0x800 ) { *out++ = 0xC0 | ( *in >> 6 ); *out++ = 0x80 | ( *in++ & 0x3F ); } - else - if ( *in < 0x10000 ) - { + else if ( *in < 0x10000 ) { *out++ = 0xE0 | ( *in >> 12 ); *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F ); *out++ = 0x80 | ( *in++ & 0x3F ); } - else - { + else { *out++ = 0xF0 | ( *in >> 18 ); *out++ = 0x80 | ( ( *in >> 12 ) & 0x3F ); *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F ); @@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ ) } } - return out - (unsigned char *) out_; + return out - (unsigned char *)out_; } long decode( char const * in_, size_t inSize, wchar * out_ ) { - unsigned char const * in = (unsigned char const *) in_; - wchar * out = out_; + unsigned char const * in = (unsigned char const *)in_; + wchar * out = out_; - while( inSize-- ) - { + while ( inSize-- ) { wchar result; - if ( *in & 0x80 ) - { - if ( *in & 0x40 ) - { - if ( *in & 0x20 ) - { - if ( *in & 0x10 ) - { + if ( *in & 0x80 ) { + if ( *in & 0x40 ) { + if ( *in & 0x20 ) { + if ( *in & 0x10 ) { // Four-byte sequence if ( *in & 8 ) // This can't be @@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) inSize -= 3; - result = ( (wchar )*in++ & 7 ) << 18; + result = ( (wchar)*in++ & 7 ) << 18; if ( ( *in & 0xC0 ) != 0x80 ) return -1; @@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) return -1; result |= (wchar)*in++ & 0x3F; } - else - { + else { // Three-byte sequence if ( inSize < 2 ) @@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) inSize -= 2; - result = ( (wchar )*in++ & 0xF ) << 12; + result = ( (wchar)*in++ & 0xF ) << 12; if ( ( *in & 0xC0 ) != 0x80 ) return -1; @@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ ) result |= (wchar)*in++ & 0x3F; } } - else - { + else { // Two-byte sequence if ( !inSize ) return -1; --inSize; - result = ( (wchar )*in++ & 0x1F ) << 6; + result = ( (wchar)*in++ & 0x1F ) << 6; if ( ( *in & 0xC0 ) != 0x80 ) return -1; result |= (wchar)*in++ & 0x3F; } } - else - { + else { // This char is from the middle of encoding, it can't be leading return -1; } @@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept std::vector< char > buffer( in.size() * 4 ); - return string( &buffer.front(), - encode( in.data(), in.size(), &buffer.front() ) ); + return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) ); } -wstring decode( string const & in ) +wstring decode( string const & in ) { if ( in.empty() ) return {}; std::vector< wchar > buffer( in.size() ); - long result = decode( in.data(), in.size(), &buffer.front() ); + long result = decode( in.data(), in.size(), &buffer.front() ); if ( result < 0 ) throw exCantDecode( in ); @@ -160,8 +147,7 @@ wstring decode( string const & in ) bool isspace( int c ) { - switch( c ) - { + switch ( c ) { case ' ': case '\f': case '\n': @@ -176,62 +162,95 @@ bool isspace( int c ) } //get the first line in string s1. -1 if not found -int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length) +int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length ) { - char* pos = std::search(s1,s1+s1length, s2, s2+s2length); + char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length ); - if (pos == s1 + s1length) - return pos-s1; + if ( pos == s1 + s1length ) + return pos - s1; - //the line size. - return pos- s1+ s2length; + //the line size. + return pos - s1 + s2length; } -char const* getEncodingNameFor(Encoding e) +char const * getEncodingNameFor( Encoding e ) { - switch (e) - { + switch ( e ) { + case Utf32LE: + return "UTF-32LE"; + case Utf32BE: + return "UTF-32BE"; case Utf16LE: - return "UTF-16LE"; + return "UTF-16LE"; case Utf16BE: - return "UTF-16BE"; + return "UTF-16BE"; case Windows1252: - return "WINDOWS-1252"; + return "WINDOWS-1252"; case Windows1251: - return "WINDOWS-1251"; + return "WINDOWS-1251"; case Utf8: - return "UTF-8"; + return "UTF-8"; case Windows1250: + return "WINDOWS-1250"; default: - return "WINDOWS-1250"; - } + return "UTF-8"; + } } -LineFeed initLineFeed(Encoding e) +Encoding getEncodingForName( const QByteArray & _name ) { - LineFeed lf; - switch (e) - { - case Utf8::Utf16LE: - lf.lineFeed= new char[2]{ 0x0A,0 }; - lf.length = 2; - break; - case Utf8::Utf16BE: - lf.lineFeed = new char[2]{ 0,0x0A }; - lf.length = 2; - break; - case Utf8::Windows1252: - - case Utf8::Windows1251: - - case Utf8::Utf8: - - case Utf8::Windows1250: - default: - lf.length = 1; - lf.lineFeed = new char[1]{ 0x0A }; - } - return lf; + const auto name = _name.toUpper(); + if ( name == "UTF-32LE" ) + return Utf32LE; + if ( name == "UTF-32BE" ) + return Utf32BE; + if ( name == "UTF-16LE" ) + return Utf16LE; + if ( name == "UTF-16BE" ) + return Utf16BE; + if ( name == "WINDOWS-1252" ) + return Windows1252; + if ( name == "WINDOWS-1251" ) + return Windows1251; + if ( name == "UTF-8" ) + return Utf8; + if ( name == "WINDOWS-1250" ) + return Windows1250; + return Utf8; } +LineFeed initLineFeed( const Encoding e ) +{ + LineFeed lf{}; + switch ( e ) { + case Utf8::Utf32LE: + lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 }; + lf.length = 4; + break; + case Utf8::Utf32BE: + lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A }; + lf.length = 4; + break; + case Utf8::Utf16LE: + lf.lineFeed = new char[ 2 ]{ 0x0A, 0 }; + lf.length = 2; + break; + case Utf8::Utf16BE: + lf.lineFeed = new char[ 2 ]{ 0, 0x0A }; + lf.length = 2; + break; + case Utf8::Windows1252: + + case Utf8::Windows1251: + + case Utf8::Utf8: + + case Utf8::Windows1250: + default: + lf.length = 1; + lf.lineFeed = new char[ 1 ]{ 0x0A }; + } + return lf; } + +} // namespace Utf8 diff --git a/src/common/utf8.hh b/src/common/utf8.hh index 75f96503..787024ba 100644 --- a/src/common/utf8.hh +++ b/src/common/utf8.hh @@ -4,6 +4,7 @@ #define __UTF8_HH_INCLUDED__ #include +#include #include #include "ex.hh" #include "wstring.hh" @@ -15,14 +16,15 @@ namespace Utf8 { // Those are possible encodings for .dsl files -enum Encoding -{ +enum Encoding { Utf16LE, Utf16BE, Windows1252, Windows1251, Windows1250, - Utf8 // This is an extension. Detected solely by the UTF8 BOM. + Utf8, // This is an extension. Detected solely by the UTF8 BOM. + Utf32BE, + Utf32LE, }; using std::string; @@ -54,7 +56,8 @@ bool isspace( int c ); //get the first line in string s1. -1 if not found int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length); -char const* getEncodingNameFor(Encoding e); +char const * getEncodingNameFor( Encoding e ); +Encoding getEncodingForName( const QByteArray & name ); struct LineFeed { diff --git a/src/dict/dsl_details.cc b/src/dict/dsl_details.cc index 7ec7d328..3ca218a8 100644 --- a/src/dict/dsl_details.cc +++ b/src/dict/dsl_details.cc @@ -9,6 +9,7 @@ #include "ufile.hh" #include "utf8.hh" +#include #include #include @@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const wstring result; - for( list< Node >::const_iterator i = begin(); i != end(); ++i ) - if( !stripTrsTag || i->tagName != U"!trs" ) - result += i->renderAsText( stripTrsTag ); + for ( const auto & i : *this ) + if ( !stripTrsTag || i.tagName != U"!trs" ) + result += i.renderAsText( stripTrsTag ); return result; } @@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, processUnsortedParts( linkTo, true ); expandOptionalParts( linkTo, &allLinkEntries ); - for( list< wstring >::iterator entry = allLinkEntries.begin(); - entry != allLinkEntries.end(); ) - { + for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) { if ( !textNode ) { Node text = Node( Node::Text(), wstring() ); @@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, ArticleDom nodeDom( linkText, dictName, headword_ ); Node link( Node::Tag(), U"@" , wstring() ); - for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n ) - link.push_back( *n ); + for ( auto & n : nodeDom.root ) + link.push_back( n ); ++entry; @@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, nextChar(); } } - catch( eot ) - { - if( !dictionaryName.empty() ) + catch ( std::exception & ex ) { + if ( !dictionaryName.empty() ) gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)", - QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(), - dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() ); + QString::fromStdU32String( name ).toUtf8().data(), + QString::fromStdU32String( attrs ).toUtf8().data(), + dictionaryName.c_str(), + QString::fromStdU32String( headword ).toUtf8().data() ); else gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)", - QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() ); + QString::fromStdU32String( name ).toUtf8().data(), + QString::fromStdU32String( attrs ).toUtf8().data() ); - throw eot(); + throw ex; } // Add the tag, or close it @@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, ArticleDom nodeDom( linkText, dictName, headword_ ); Node link( Node::Tag(), U"ref" , wstring() ); - for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n ) - link.push_back( *n ); + for ( auto & n : nodeDom.root ) + link.push_back( n ); if ( stack.empty() ) root.push_back( link ); @@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName, textNode->text.push_back( ch ); } // for( ; ; ) } - catch( eot ) - { + catch ( eot & ) { } if ( textNode ) stack.pop_back(); - if ( stack.size() ) - { - list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() ); + if ( !stack.empty() ) { + auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() ); if( it == stack.end() ) return; // no unclosed tags that must be closed => nothing to warn about QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8(); @@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name, // All tags above [m] tag will be closed and reopened after // to avoid break this tag by closing some other tag. - while( stack.size() ) - { - nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName, - stack.back()->tagAttrs ) ); + while ( !stack.empty() ) { + nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs ); if ( stack.back()->empty() ) { @@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name, stack.pop_back(); - Node * parent = stack.size() ? stack.back() : &root; + Node * parent = !stack.empty() ? stack.back() : &root; parent->pop_back(); } @@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name, // Reopen tags if needed - while( nodesToReopen.size() ) - { + while ( !nodesToReopen.empty() ) { if ( stack.empty() ) { root.push_back( nodesToReopen.back() ); @@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name, nodesToReopen.pop_back(); } - } void ArticleDom::closeTag( wstring const & name, @@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name, list< Node > nodesToReopen; - while( stack.size() ) - { + while ( !stack.empty() ) { bool found = stack.back()->tagName == name || checkM( stack.back()->tagName, name ); if ( !found ) - nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName, - stack.back()->tagAttrs ) ); + nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs ); if( stack.back()->empty() && stack.back()->tagName != U"br" ) { @@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name, stack.pop_back(); - Node * parent = stack.size() ? stack.back() : &root; + Node * parent = !stack.empty() ? stack.back() : &root; parent->pop_back(); } @@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name, break; } - while( nodesToReopen.size() ) - { + while ( !nodesToReopen.empty() ) { if ( stack.empty() ) { root.push_back( nodesToReopen.back() ); @@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) : // Now try guessing the encoding by reading the first two bytes - unsigned char firstBytes[ 2 ]; + unsigned char firstBytes[ 50 ]; - if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) - { + if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) { // Apparently the file's too short gzclose( f ); throw exMalformedDslFile( fileName ); @@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) : bool needExactEncoding = false; + QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 ); + codec = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) ); - // If the file begins with the dedicated Unicode marker, we just consume - // it. If, on the other hand, it's not, we return the bytes back - if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) - encoding = Utf8::Utf16LE; - else - if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) - encoding = Utf8::Utf16BE; - else - if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) - { - // Looks like Utf8, read one more byte - if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF ) - { - // Either the file's too short, or the BOM is weird - gzclose( f ); - throw exMalformedDslFile( fileName ); - } - - encoding = Utf8::Utf8; - } - else - { - if ( firstBytes[ 0 ] && !firstBytes[ 1 ] ) - encoding = Utf8::Utf16LE; - else - if ( !firstBytes[ 0 ] && firstBytes[ 1 ] ) - encoding = Utf8::Utf16BE; - else - { - // Ok, this doesn't look like 16-bit Unicode. We will start with a - // 8-bit encoding with an intent to find out the exact one from - // the header. - needExactEncoding = true; - encoding = Utf8::Windows1251; - } + encoding = Utf8::getEncodingForName( codec->name() ); + qDebug() << codec->name(); - if ( gzrewind( f ) ) - { - gzclose( f ); - throw exCantOpen( fileName ); - } + if ( gzrewind( f ) ) { + gzclose( f ); + throw exCantOpen( fileName ); } //iconv.reinit( encoding ); - codec = QTextCodec::codecForName(getEncodingNameFor(encoding)); - lineFeed=Utf8::initLineFeed(encoding); + lineFeed = Utf8::initLineFeed( encoding ); // We now can use our own readNextLine() function wstring str;