From 67ed24c61c29d4025843c9d892f0445c4c008bde Mon Sep 17 00:00:00 2001
From: xiaoyifang <105986+xiaoyifang@users.noreply.github.com>
Date: Fri, 9 Jun 2023 08:01:45 +0800
Subject: [PATCH] fix: support dsl dictionary utf encoding detection (#830)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: support dsl dictionary utf encoding detection

* fix: code smells

* fix: code smells

* 🎨 apply clang-format changes

* fix: adjust linefeed

---------

Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
---
 src/common/utf8.cc      | 177 ++++++++++++++++++++++------------------
 src/common/utf8.hh      |  11 ++-
 src/dict/dsl_details.cc | 121 +++++++++------------------
 3 files changed, 144 insertions(+), 165 deletions(-)
diff --git a/src/common/utf8.cc b/src/common/utf8.cc
index ea641408..8cf9c4ab 100644
--- a/src/common/utf8.cc
+++ b/src/common/utf8.cc
@@ -4,32 +4,28 @@
 #include "utf8.hh"
 #include <vector>
 #include <algorithm>
+#include <QByteArray>
+#include <QString>
 
 namespace Utf8 {
 
 size_t encode( wchar const * in, size_t inSize, char * out_ )
 {
-  unsigned char * out = (unsigned char *) out_;
+  unsigned char * out = (unsigned char *)out_;
 
-  while( inSize-- )
-  {
+  while ( inSize-- ) {
     if ( *in < 0x80 )
       *out++ = *in++;
-    else
-    if ( *in < 0x800 )
-    {
+    else if ( *in < 0x800 ) {
       *out++ = 0xC0 | ( *in >> 6 );
       *out++ = 0x80 | ( *in++ & 0x3F );
     }
-    else
-    if ( *in < 0x10000 )
-    {
+    else if ( *in < 0x10000 ) {
       *out++ = 0xE0 | ( *in >> 12 );
       *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
       *out++ = 0x80 | ( *in++ & 0x3F );
     }
-    else
-    {
+    else {
       *out++ = 0xF0 | ( *in >> 18 );
       *out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
       *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
@@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
     }
   }
 
-  return out - (unsigned char *) out_;
+  return out - (unsigned char *)out_;
 }
 
 long decode( char const * in_, size_t inSize, wchar * out_ )
 {
-  unsigned char const * in = (unsigned char const *) in_;
-  wchar * out = out_;
+  unsigned char const * in = (unsigned char const *)in_;
+  wchar * out              = out_;
 
-  while( inSize-- )
-  {
+  while ( inSize-- ) {
     wchar result;
 
-    if ( *in & 0x80 )
-    {
-      if ( *in & 0x40 )
-      {
-        if ( *in & 0x20 )
-        {
-          if ( *in & 0x10 )
-          {
+    if ( *in & 0x80 ) {
+      if ( *in & 0x40 ) {
+        if ( *in & 0x20 ) {
+          if ( *in & 0x10 ) {
             // Four-byte sequence
             if ( *in & 8 )
               // This can't be
@@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
 
             inSize -= 3;
 
-            result = ( (wchar )*in++ & 7 ) << 18;
+            result = ( (wchar)*in++ & 7 ) << 18;
 
             if ( ( *in & 0xC0 ) != 0x80 )
               return -1;
@@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
               return -1;
             result |= (wchar)*in++ & 0x3F;
           }
-          else
-          {
+          else {
             // Three-byte sequence
 
             if ( inSize < 2 )
@@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
 
             inSize -= 2;
 
-            result = ( (wchar )*in++ & 0xF ) << 12;
+            result = ( (wchar)*in++ & 0xF ) << 12;
 
             if ( ( *in & 0xC0 ) != 0x80 )
               return -1;
@@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
             result |= (wchar)*in++ & 0x3F;
           }
         }
-        else
-        {
+        else {
           // Two-byte sequence
           if ( !inSize )
             return -1;
 
           --inSize;
 
-          result = ( (wchar )*in++ & 0x1F ) << 6;
+          result = ( (wchar)*in++ & 0x1F ) << 6;
 
           if ( ( *in & 0xC0 ) != 0x80 )
             return -1;
           result |= (wchar)*in++ & 0x3F;
         }
       }
-      else
-      {
+      else {
         // This char is from the middle of encoding, it can't be leading
         return -1;
       }
@@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept
 
   std::vector< char > buffer( in.size() * 4 );
 
-  return string( &buffer.front(),
-                 encode( in.data(), in.size(), &buffer.front() ) );
+  return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
 }
 
-wstring decode( string const & in ) 
+wstring decode( string const & in )
 {
   if ( in.empty() )
     return {};
 
   std::vector< wchar > buffer( in.size() );
 
-  long result = decode( in.data(),  in.size(), &buffer.front() );
+  long result = decode( in.data(), in.size(), &buffer.front() );
 
   if ( result < 0 )
     throw exCantDecode( in );
@@ -160,8 +147,7 @@ wstring decode( string const & in )
 
 bool isspace( int c )
 {
-  switch( c )
-  {
+  switch ( c ) {
     case ' ':
     case '\f':
     case '\n':
@@ -176,62 +162,95 @@ bool isspace( int c )
 }
 
 //get the first line in string s1. -1 if not found
-int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
+int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
 {
-    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
+  char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length ); // first occurrence of s2 within s1
 
-    if (pos == s1 + s1length)
-        return pos-s1;
+  if ( pos == s1 + s1length ) // std::search returned the end of s1: s2 does not occur
+    return pos - s1; // evaluates to s1length, not the -1 mentioned in the comment above the function
 
-    //the line size.
-    return pos- s1+ s2length;
+  // s2 was found: return the offset just past it, i.e. the size of the first line including s2.
+  return pos - s1 + s2length;
 }
 
-char const* getEncodingNameFor(Encoding e)
+char const * getEncodingNameFor( Encoding e ) // maps an Encoding value to its codec name string
 {
-    switch (e)
-    {
+  switch ( e ) {
+    case Utf32LE:
+      return "UTF-32LE";
+    case Utf32BE:
+      return "UTF-32BE";
     case Utf16LE:
-        return "UTF-16LE";
+      return "UTF-16LE";
     case Utf16BE:
-        return "UTF-16BE";
+      return "UTF-16BE";
     case Windows1252:
-        return "WINDOWS-1252";
+      return "WINDOWS-1252";
     case Windows1251:
-        return "WINDOWS-1251";
+      return "WINDOWS-1251";
     case Utf8:
-        return "UTF-8";
+      return "UTF-8";
     case Windows1250:
+      return "WINDOWS-1250";
     default:
-        return "WINDOWS-1250";
-    }
+      return "UTF-8"; // fallback for values not listed above; the removed code fell back to "WINDOWS-1250"
+  }
 }
 
-LineFeed initLineFeed(Encoding e)
+Encoding getEncodingForName( const QByteArray & _name )
 {
-    LineFeed lf;
-	switch (e)
-	{
-	case Utf8::Utf16LE:
-        lf.lineFeed= new char[2]{ 0x0A,0 };
-        lf.length = 2;
-		break;
-	case Utf8::Utf16BE:
-        lf.lineFeed = new char[2]{ 0,0x0A };
-        lf.length = 2;
-		break;
-	case Utf8::Windows1252:
-
-	case Utf8::Windows1251:
-
-	case Utf8::Utf8:
-
-	case Utf8::Windows1250:
-	default:
-        lf.length = 1;
-        lf.lineFeed = new char[1]{ 0x0A };
-	}
-    return lf;
+  // Compare case-insensitively: the incoming name is upper-cased, the table is upper case.
+  const QByteArray wanted = _name.toUpper();
+  static const struct { const char * codecName; Encoding encoding; } knownCodecs[] = {
+    { "UTF-32LE", Utf32LE },
+    { "UTF-32BE", Utf32BE },
+    { "UTF-16LE", Utf16LE },
+    { "UTF-16BE", Utf16BE },
+    { "WINDOWS-1252", Windows1252 },
+    { "WINDOWS-1251", Windows1251 },
+    { "UTF-8", Utf8 },
+    { "WINDOWS-1250", Windows1250 },
+  };
+  for ( const auto & candidate : knownCodecs ) {
+    if ( wanted == candidate.codecName )
+      return candidate.encoding;
+  }
+  // Names outside the table are treated as UTF-8.
+  return Utf8;
 }
 
+LineFeed initLineFeed( const Encoding e ) // builds the byte pattern of a line feed (0x0A) and its length for encoding e
+{
+  LineFeed lf{}; // NOTE(review): the buffers below come from new[]; where they are released is not visible here - confirm ownership
+  switch ( e ) {
+    case Utf8::Utf32LE:
+      lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 }; // four-byte unit, 0x0A in the first byte
+      lf.length   = 4;
+      break;
+    case Utf8::Utf32BE:
+      lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A }; // four-byte unit, 0x0A in the last byte
+      lf.length   = 4;
+      break;
+    case Utf8::Utf16LE:
+      lf.lineFeed = new char[ 2 ]{ 0x0A, 0 }; // two-byte unit, 0x0A in the first byte
+      lf.length   = 2;
+      break;
+    case Utf8::Utf16BE:
+      lf.lineFeed = new char[ 2 ]{ 0, 0x0A }; // two-byte unit, 0x0A in the last byte
+      lf.length   = 2;
+      break;
+    case Utf8::Windows1252:
+
+    case Utf8::Windows1251:
+
+    case Utf8::Utf8:
+
+    case Utf8::Windows1250:
+    default: // all remaining encodings share the single-byte line feed
+      lf.length   = 1;
+      lf.lineFeed = new char[ 1 ]{ 0x0A };
+  }
+  return lf;
 }
+
+} // namespace Utf8
diff --git a/src/common/utf8.hh b/src/common/utf8.hh
index 75f96503..787024ba 100644
--- a/src/common/utf8.hh
+++ b/src/common/utf8.hh
@@ -4,6 +4,7 @@
 #define __UTF8_HH_INCLUDED__
 
 #include <cstdio>
+#include <QByteArray>
 #include <string>
 #include "ex.hh"
 #include "wstring.hh"
@@ -15,14 +16,15 @@
 namespace Utf8 {
 
 // Those are possible encodings for .dsl files
-enum Encoding
-{
+enum Encoding {
   Utf16LE,
   Utf16BE,
   Windows1252,
   Windows1251,
   Windows1250,
-  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
+  Utf8, // This is an extension. Chosen on a UTF-8 BOM and also used as the fallback when no BOM is detected.
+  Utf32BE, // appended after Utf8, so the enumerators above keep their previous values
+  Utf32LE, // getEncodingNameFor() reports this one as "UTF-32LE"
 };
 
 using std::string;
@@ -54,7 +56,8 @@ bool isspace( int c );
 
 //get the first line in string s1. -1 if not found
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
-char const* getEncodingNameFor(Encoding e);
+char const * getEncodingNameFor( Encoding e ); // codec name string for e
+Encoding getEncodingForName( const QByteArray & name ); // reverse lookup, case-insensitive; unrecognised names yield Utf8
 
 struct LineFeed
 {
diff --git a/src/dict/dsl_details.cc b/src/dict/dsl_details.cc
index 7ec7d328..3ca218a8 100644
--- a/src/dict/dsl_details.cc
+++ b/src/dict/dsl_details.cc
@@ -9,6 +9,7 @@
 #include "ufile.hh"
 #include "utf8.hh"
 
+#include <exception>
 #include <stdio.h>
 #include <wctype.h>
 
@@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
 
   wstring result;
 
-  for( list< Node >::const_iterator i = begin(); i != end(); ++i )
-    if( !stripTrsTag || i->tagName !=  U"!trs"  )
-      result += i->renderAsText( stripTrsTag );
+  for ( const auto & i : *this )
+    if ( !stripTrsTag || i.tagName != U"!trs" )
+      result += i.renderAsText( stripTrsTag );
 
   return result;
 }
@@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
             processUnsortedParts( linkTo, true );
             expandOptionalParts( linkTo, &allLinkEntries );
 
-            for( list< wstring >::iterator entry = allLinkEntries.begin();
-                 entry != allLinkEntries.end(); )
-            {
+            for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
               if ( !textNode )
               {
                 Node text = Node( Node::Text(), wstring() );
@@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
               ArticleDom nodeDom( linkText, dictName, headword_ );
 
               Node link( Node::Tag(),  U"@" , wstring() );
-              for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
-                link.push_back( *n );
+              for ( auto & n : nodeDom.root )
+                link.push_back( n );
 
               ++entry;
 
@@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
             nextChar();
           }
         }
-        catch( eot )
-        {
-          if( !dictionaryName.empty() )
+        catch ( const std::exception & ) { // log the unfinished tag, then let the same exception continue upwards
+          if ( !dictionaryName.empty() )
             gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(),
-                       dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() );
+                       QString::fromStdU32String( name ).toUtf8().data(),
+                       QString::fromStdU32String( attrs ).toUtf8().data(),
+                       dictionaryName.c_str(),
+                       QString::fromStdU32String( headword ).toUtf8().data() );
           else
             gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() );
+                       QString::fromStdU32String( name ).toUtf8().data(),
+                       QString::fromStdU32String( attrs ).toUtf8().data() );
 
-          throw eot();
+          throw; // rethrow the original object; "throw ex" would throw a sliced std::exception copy that the outer catch ( eot & ) cannot match
         }
 
         // Add the tag, or close it
@@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
           ArticleDom nodeDom( linkText, dictName, headword_ );
 
           Node link( Node::Tag(),  U"ref" , wstring() );
-          for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
-            link.push_back( *n );
+          for ( auto & n : nodeDom.root )
+            link.push_back( n );
 
           if ( stack.empty() )
             root.push_back( link );
@@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
       textNode->text.push_back( ch );
     } // for( ; ; )
   }
-  catch( eot )
-  {
+  catch ( eot & ) {
   }
 
   if ( textNode )
     stack.pop_back();
 
-  if ( stack.size() )
-  {
-    list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
+  if ( !stack.empty() ) {
+    auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
     if( it == stack.end() )
       return; // no unclosed tags that must be closed => nothing to warn about
     QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
@@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
     // All tags above [m] tag will be closed and reopened after
     // to avoid break this tag by closing some other tag.
 
-    while( stack.size() )
-    {
-      nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
-                                     stack.back()->tagAttrs ) );
+    while ( !stack.empty() ) {
+      nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
 
       if ( stack.back()->empty() )
       {
@@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,
 
         stack.pop_back();
 
-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;
 
         parent->pop_back();
       }
@@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,
 
   // Reopen tags if needed
 
-  while( nodesToReopen.size() )
-  {
+  while ( !nodesToReopen.empty() ) {
     if ( stack.empty() )
     {
       root.push_back( nodesToReopen.back() );
@@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,
 
     nodesToReopen.pop_back();
   }
-
 }
 
 void ArticleDom::closeTag( wstring const & name,
@@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,
 
     list< Node > nodesToReopen;
 
-    while( stack.size() )
-    {
+    while ( !stack.empty() ) {
       bool found = stack.back()->tagName == name ||
                    checkM( stack.back()->tagName, name );
 
       if ( !found )
-        nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
-                                       stack.back()->tagAttrs ) );
+        nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
 
       if( stack.back()->empty() && stack.back()->tagName !=  U"br"  )
       {
@@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,
 
         stack.pop_back();
 
-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;
 
         parent->pop_back();
       }
@@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
         break;
     }
 
-    while( nodesToReopen.size() )
-    {
+    while ( !nodesToReopen.empty() ) {
       if ( stack.empty() )
       {
         root.push_back( nodesToReopen.back() );
@@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :
 
   // Now try guessing the encoding by reading the first two bytes
 
-  unsigned char firstBytes[ 2 ];
+  unsigned char firstBytes[ 50 ] = {}; // zero-filled so a short read leaves no indeterminate bytes for the BOM detection
 
-  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
-  {
+  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) < 2 ) { // keep the previous 2-byte minimum; a read error (-1) is rejected too
     // Apparently the file's too short
     gzclose( f );
     throw exMalformedDslFile( fileName );
@@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :
 
   bool needExactEncoding = false;
 
+  QByteArray ba = QByteArray::fromRawData( reinterpret_cast< const char * >( firstBytes ), sizeof( firstBytes ) ); // non-copying view; size follows the buffer declaration
+  codec         = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) ); // detection is BOM-based; the UTF-8 codec is the default when none is found
 
-  // If the file begins with the dedicated Unicode marker, we just consume
-  // it. If, on the other hand, it's not, we return the bytes back
-  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf8::Utf16LE;
-  else
-  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf8::Utf16BE;
-  else
-  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
-  {
-    // Looks like Utf8, read one more byte
-    if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
-    {
-      // Either the file's too short, or the BOM is weird
-      gzclose( f );
-      throw exMalformedDslFile( fileName );
-    }
-    
-    encoding = Utf8::Utf8;
-  }
-  else
-  {
-    if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
-      encoding = Utf8::Utf16LE;
-    else
-    if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
-      encoding = Utf8::Utf16BE;
-    else
-    {
-      // Ok, this doesn't look like 16-bit Unicode. We will start with a
-      // 8-bit encoding with an intent to find out the exact one from
-      // the header.
-      needExactEncoding = true;
-      encoding = Utf8::Windows1251;
-    }
+  encoding = Utf8::getEncodingForName( codec->name() );
+  qDebug() << codec->name();
 
-    if ( gzrewind( f ) )
-    {
-      gzclose( f );
-      throw exCantOpen( fileName );
-    }
+  if ( gzrewind( f ) ) {
+    gzclose( f );
+    throw exCantOpen( fileName );
   }
 
   //iconv.reinit( encoding );
-  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
-  lineFeed=Utf8::initLineFeed(encoding);
+  lineFeed = Utf8::initLineFeed( encoding );
   // We now can use our own readNextLine() function
 
   wstring str;