fix: support dsl dictionary utf encoding detection (#830)

* fix: support dsl dictionary utf encoding detection * fix: code smells * fix: code smells * 🎨 apply clang-format changes * fix: adjust linefeed --------- Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
2024-11-24 04:24:09 +00:00 · 2023-06-09 08:01:45 +08:00 · 2023-06-09 08:01:45 +08:00 · 67ed24c61c
parent 5fd9261047
commit 67ed24c61c
3 changed files with 144 additions and 165 deletions
--- a/src/common/utf8.cc
+++ b/src/common/utf8.cc
@ -4,32 +4,28 @@
 #include "utf8.hh"
 #include <vector>
 #include <algorithm>
+#include <QByteArray>
+#include <QString>

 namespace Utf8 {

 size_t encode( wchar const * in, size_t inSize, char * out_ )
 {
-  unsigned char * out = (unsigned char *) out_;
+  unsigned char * out = (unsigned char *)out_;

-  while( inSize-- )
-  {
+  while ( inSize-- ) {
    if ( *in < 0x80 )
      *out++ = *in++;
-    else
-    if ( *in < 0x800 )
-    {
+    else if ( *in < 0x800 ) {
      *out++ = 0xC0 | ( *in >> 6 );
      *out++ = 0x80 | ( *in++ & 0x3F );
    }
-    else
-    if ( *in < 0x10000 )
-    {
+    else if ( *in < 0x10000 ) {
      *out++ = 0xE0 | ( *in >> 12 );
      *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
      *out++ = 0x80 | ( *in++ & 0x3F );
    }
-    else
-    {
+    else {
      *out++ = 0xF0 | ( *in >> 18 );
      *out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
      *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
    }
  }

-  return out - (unsigned char *) out_;
+  return out - (unsigned char *)out_;
 }

 long decode( char const * in_, size_t inSize, wchar * out_ )
 {
-  unsigned char const * in = (unsigned char const *) in_;
-  wchar * out = out_;
+  unsigned char const * in = (unsigned char const *)in_;
+  wchar * out              = out_;

-  while( inSize-- )
-  {
+  while ( inSize-- ) {
    wchar result;

-    if ( *in & 0x80 )
-    {
-      if ( *in & 0x40 )
-      {
-        if ( *in & 0x20 )
-        {
-          if ( *in & 0x10 )
-          {
+    if ( *in & 0x80 ) {
+      if ( *in & 0x40 ) {
+        if ( *in & 0x20 ) {
+          if ( *in & 0x10 ) {
            // Four-byte sequence
            if ( *in & 8 )
              // This can't be
@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )

            inSize -= 3;

-            result = ( (wchar )*in++ & 7 ) << 18;
+            result = ( (wchar)*in++ & 7 ) << 18;

            if ( ( *in & 0xC0 ) != 0x80 )
              return -1;
@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
              return -1;
            result |= (wchar)*in++ & 0x3F;
          }
-          else
-          {
+          else {
            // Three-byte sequence

            if ( inSize < 2 )
@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )

            inSize -= 2;

-            result = ( (wchar )*in++ & 0xF ) << 12;
+            result = ( (wchar)*in++ & 0xF ) << 12;

            if ( ( *in & 0xC0 ) != 0x80 )
              return -1;
@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
            result |= (wchar)*in++ & 0x3F;
          }
        }
-        else
-        {
+        else {
          // Two-byte sequence
          if ( !inSize )
            return -1;

          --inSize;

-          result = ( (wchar )*in++ & 0x1F ) << 6;
+          result = ( (wchar)*in++ & 0x1F ) << 6;

          if ( ( *in & 0xC0 ) != 0x80 )
            return -1;
          result |= (wchar)*in++ & 0x3F;
        }
      }
-      else
-      {
+      else {
        // This char is from the middle of encoding, it can't be leading
        return -1;
      }
@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept

  std::vector< char > buffer( in.size() * 4 );

-  return string( &buffer.front(),
-                 encode( in.data(), in.size(), &buffer.front() ) );
+  return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
 }

-wstring decode( string const & in ) 
+wstring decode( string const & in )
 {
  if ( in.empty() )
    return {};

  std::vector< wchar > buffer( in.size() );

-  long result = decode( in.data(),  in.size(), &buffer.front() );
+  long result = decode( in.data(), in.size(), &buffer.front() );

  if ( result < 0 )
    throw exCantDecode( in );
@ -160,8 +147,7 @@ wstring decode( string const & in )

 bool isspace( int c )
 {
-  switch( c )
-  {
+  switch ( c ) {
    case ' ':
    case '\f':
    case '\n':
@ -176,62 +162,95 @@ bool isspace( int c )
 }

 //Returns the length of the first line in s1, including the line feed sequence s2; returns s1length if s2 is not found.
-int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
+int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
 {
-    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
+  char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length );

-    if (pos == s1 + s1length)
-        return pos-s1;
+  if ( pos == s1 + s1length )
+    return pos - s1;

-    //the line size.
-    return pos- s1+ s2length;
+  //the line size.
+  return pos - s1 + s2length;
 }

-char const* getEncodingNameFor(Encoding e)
+char const * getEncodingNameFor( Encoding e )
 {
-    switch (e)
-    {
+  switch ( e ) {
+    case Utf32LE:
+      return "UTF-32LE";
+    case Utf32BE:
+      return "UTF-32BE";
    case Utf16LE:
-        return "UTF-16LE";
+      return "UTF-16LE";
    case Utf16BE:
-        return "UTF-16BE";
+      return "UTF-16BE";
    case Windows1252:
-        return "WINDOWS-1252";
+      return "WINDOWS-1252";
    case Windows1251:
-        return "WINDOWS-1251";
+      return "WINDOWS-1251";
    case Utf8:
-        return "UTF-8";
+      return "UTF-8";
    case Windows1250:
+      return "WINDOWS-1250";
    default:
-        return "WINDOWS-1250";
-    }
+      return "UTF-8";
+  }
 }

-LineFeed initLineFeed(Encoding e)
+Encoding getEncodingForName( const QByteArray & _name )
 {
-    LineFeed lf;
-	switch (e)
-	{
-	case Utf8::Utf16LE:
-        lf.lineFeed= new char[2]{ 0x0A,0 };
-        lf.length = 2;
-		break;
-	case Utf8::Utf16BE:
-        lf.lineFeed = new char[2]{ 0,0x0A };
-        lf.length = 2;
-		break;
-	case Utf8::Windows1252:
-
-	case Utf8::Windows1251:
-
-	case Utf8::Utf8:
-
-	case Utf8::Windows1250:
-	default:
-        lf.length = 1;
-        lf.lineFeed = new char[1]{ 0x0A };
-	}
-    return lf;
+  const auto name = _name.toUpper();
+  if ( name == "UTF-32LE" )
+    return Utf32LE;
+  if ( name == "UTF-32BE" )
+    return Utf32BE;
+  if ( name == "UTF-16LE" )
+    return Utf16LE;
+  if ( name == "UTF-16BE" )
+    return Utf16BE;
+  if ( name == "WINDOWS-1252" )
+    return Windows1252;
+  if ( name == "WINDOWS-1251" )
+    return Windows1251;
+  if ( name == "UTF-8" )
+    return Utf8;
+  if ( name == "WINDOWS-1250" )
+    return Windows1250;
+  return Utf8;
 }

+LineFeed initLineFeed( const Encoding e )
+{
+  LineFeed lf{};
+  switch ( e ) {
+    case Utf8::Utf32LE:
+      lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
+      lf.length   = 4;
+      break;
+    case Utf8::Utf32BE:
+      lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
+      lf.length   = 4;
+      break;
+    case Utf8::Utf16LE:
+      lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
+      lf.length   = 2;
+      break;
+    case Utf8::Utf16BE:
+      lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
+      lf.length   = 2;
+      break;
+    case Utf8::Windows1252:
+
+    case Utf8::Windows1251:
+
+    case Utf8::Utf8:
+
+    case Utf8::Windows1250:
+    default:
+      lf.length   = 1;
+      lf.lineFeed = new char[ 1 ]{ 0x0A };
+  }
+  return lf;
 }
+
+} // namespace Utf8
--- a/src/common/utf8.hh
+++ b/src/common/utf8.hh
@ -4,6 +4,7 @@
 #define __UTF8_HH_INCLUDED__

 #include <cstdio>
+#include <QByteArray>
 #include <string>
 #include "ex.hh"
 #include "wstring.hh"
@ -15,14 +16,15 @@
 namespace Utf8 {

 // Those are possible encodings for .dsl files
-enum Encoding
-{
+enum Encoding {
  Utf16LE,
  Utf16BE,
  Windows1252,
  Windows1251,
  Windows1250,
-  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
+  Utf8, // This is an extension. Detected solely by the UTF8 BOM.
+  Utf32BE,
+  Utf32LE,
 };

 using std::string;
@ -54,7 +56,8 @@ bool isspace( int c );

 //Returns the length of the first line in s1, including the line feed sequence s2; returns s1length if s2 is not found.
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
-char const* getEncodingNameFor(Encoding e);
+char const * getEncodingNameFor( Encoding e );
+Encoding getEncodingForName( const QByteArray & name );

 struct LineFeed
 {
--- a/src/dict/dsl_details.cc
+++ b/src/dict/dsl_details.cc
@ -9,6 +9,7 @@
 #include "ufile.hh"
 #include "utf8.hh"

+#include <exception>
 #include <stdio.h>
 #include <wctype.h>

@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const

  wstring result;

-  for( list< Node >::const_iterator i = begin(); i != end(); ++i )
-    if( !stripTrsTag || i->tagName !=  U"!trs"  )
-      result += i->renderAsText( stripTrsTag );
+  for ( const auto & i : *this )
+    if ( !stripTrsTag || i.tagName != U"!trs" )
+      result += i.renderAsText( stripTrsTag );

  return result;
 }
@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
            processUnsortedParts( linkTo, true );
            expandOptionalParts( linkTo, &allLinkEntries );

-            for( list< wstring >::iterator entry = allLinkEntries.begin();
-                 entry != allLinkEntries.end(); )
-            {
+            for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
              if ( !textNode )
              {
                Node text = Node( Node::Text(), wstring() );
@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
              ArticleDom nodeDom( linkText, dictName, headword_ );

              Node link( Node::Tag(),  U"@" , wstring() );
-              for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
-                link.push_back( *n );
+              for ( auto & n : nodeDom.root )
+                link.push_back( n );

              ++entry;

@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
            nextChar();
          }
        }
-        catch( eot )
-        {
-          if( !dictionaryName.empty() )
+        catch ( std::exception & ex ) {
+          if ( !dictionaryName.empty() )
            gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(),
-                       dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() );
+                       QString::fromStdU32String( name ).toUtf8().data(),
+                       QString::fromStdU32String( attrs ).toUtf8().data(),
+                       dictionaryName.c_str(),
+                       QString::fromStdU32String( headword ).toUtf8().data() );
          else
            gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() );
+                       QString::fromStdU32String( name ).toUtf8().data(),
+                       QString::fromStdU32String( attrs ).toUtf8().data() );

-          throw eot();
+          throw ex;
        }

        // Add the tag, or close it
@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
          ArticleDom nodeDom( linkText, dictName, headword_ );

          Node link( Node::Tag(),  U"ref" , wstring() );
-          for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
-            link.push_back( *n );
+          for ( auto & n : nodeDom.root )
+            link.push_back( n );

          if ( stack.empty() )
            root.push_back( link );
@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
      textNode->text.push_back( ch );
    } // for( ; ; )
  }
-  catch( eot )
-  {
+  catch ( eot & ) {
  }

  if ( textNode )
    stack.pop_back();

-  if ( stack.size() )
-  {
-    list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
+  if ( !stack.empty() ) {
+    auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
    if( it == stack.end() )
      return; // no unclosed tags that must be closed => nothing to warn about
    QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
    // All tags above [m] tag will be closed and reopened after
    // to avoid break this tag by closing some other tag.

-    while( stack.size() )
-    {
-      nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
-                                     stack.back()->tagAttrs ) );
+    while ( !stack.empty() ) {
+      nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );

      if ( stack.back()->empty() )
      {
@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,

        stack.pop_back();

-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;

        parent->pop_back();
      }
@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,

  // Reopen tags if needed

-  while( nodesToReopen.size() )
-  {
+  while ( !nodesToReopen.empty() ) {
    if ( stack.empty() )
    {
      root.push_back( nodesToReopen.back() );
@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,

    nodesToReopen.pop_back();
  }
-
 }

 void ArticleDom::closeTag( wstring const & name,
@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,

    list< Node > nodesToReopen;

-    while( stack.size() )
-    {
+    while ( !stack.empty() ) {
      bool found = stack.back()->tagName == name ||
                   checkM( stack.back()->tagName, name );

      if ( !found )
-        nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
-                                       stack.back()->tagAttrs ) );
+        nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );

      if( stack.back()->empty() && stack.back()->tagName !=  U"br"  )
      {
@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,

        stack.pop_back();

-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;

        parent->pop_back();
      }
@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
        break;
    }

-    while( nodesToReopen.size() )
-    {
+    while ( !nodesToReopen.empty() ) {
      if ( stack.empty() )
      {
        root.push_back( nodesToReopen.back() );
@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :

   // Now try guessing the encoding by reading the first bytes of the file

-  unsigned char firstBytes[ 2 ];
+  unsigned char firstBytes[ 50 ];

-  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
-  {
+  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) {
    // Apparently the file's too short
    gzclose( f );
    throw exMalformedDslFile( fileName );
@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :

  bool needExactEncoding = false;

+  QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 );
+  codec         = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) );

-  // If the file begins with the dedicated Unicode marker, we just consume
-  // it. If, on the other hand, it's not, we return the bytes back
-  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf8::Utf16LE;
-  else
-  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf8::Utf16BE;
-  else
-  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
-  {
-    // Looks like Utf8, read one more byte
-    if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
-    {
-      // Either the file's too short, or the BOM is weird
-      gzclose( f );
-      throw exMalformedDslFile( fileName );
-    }
-    
-    encoding = Utf8::Utf8;
-  }
-  else
-  {
-    if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
-      encoding = Utf8::Utf16LE;
-    else
-    if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
-      encoding = Utf8::Utf16BE;
-    else
-    {
-      // Ok, this doesn't look like 16-bit Unicode. We will start with a
-      // 8-bit encoding with an intent to find out the exact one from
-      // the header.
-      needExactEncoding = true;
-      encoding = Utf8::Windows1251;
-    }
+  encoding = Utf8::getEncodingForName( codec->name() );
+  qDebug() << codec->name();

-    if ( gzrewind( f ) )
-    {
-      gzclose( f );
-      throw exCantOpen( fileName );
-    }
+  if ( gzrewind( f ) ) {
+    gzclose( f );
+    throw exCantOpen( fileName );
  }

  //iconv.reinit( encoding );
-  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
-  lineFeed=Utf8::initLineFeed(encoding);
+  lineFeed = Utf8::initLineFeed( encoding );
  // We now can use our own readNextLine() function

  wstring str;