fix: support dsl dictionary utf encoding detection (#830)

* fix: support dsl dictionary utf encoding detection * fix: code smells * fix: code smells * 🎨 apply clang-format changes * fix: adjust linefeed --------- Co-authored-by: xiaoyifang <xiaoyifang@users.noreply.github.com>
2024-11-24 04:24:09 +00:00 · 2023-06-09 08:01:45 +08:00 · 2023-06-09 08:01:45 +08:00 · 67ed24c61c
parent 5fd9261047
commit 67ed24c61c
3 changed files with 144 additions and 165 deletions
--- a/src/common/utf8.cc
+++ b/src/common/utf8.cc
@ -4,32 +4,28 @@
 #include "utf8.hh"
 #include <vector>
 #include <algorithm>
 #include <QByteArray>
 #include <QString>
 namespace Utf8 {
 size_t encode( wchar const * in, size_t inSize, char * out_ )
 {
-  unsigned char * out = (unsigned char *) out_;
+  unsigned char * out = (unsigned char *)out_;
-  while( inSize-- )
+  while ( inSize-- ) {
  {
    if ( *in < 0x80 )
      *out++ = *in++;
-    else
+    else if ( *in < 0x800 ) {
    if ( *in < 0x800 )
    {
      *out++ = 0xC0 | ( *in >> 6 );
      *out++ = 0x80 | ( *in++ & 0x3F );
    }
-    else
+    else if ( *in < 0x10000 ) {
    if ( *in < 0x10000 )
    {
      *out++ = 0xE0 | ( *in >> 12 );
      *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
      *out++ = 0x80 | ( *in++ & 0x3F );
    }
-    else
+    else {
    {
      *out++ = 0xF0 | ( *in >> 18 );
      *out++ = 0x80 | ( ( *in >> 12 ) & 0x3F );
      *out++ = 0x80 | ( ( *in >> 6 ) & 0x3F );
@ -37,26 +33,21 @@ size_t encode( wchar const * in, size_t inSize, char * out_ )
    }
  }
-  return out - (unsigned char *) out_;
+  return out - (unsigned char *)out_;
 }
 long decode( char const * in_, size_t inSize, wchar * out_ )
 {
-  unsigned char const * in = (unsigned char const *) in_;
+  unsigned char const * in = (unsigned char const *)in_;
-  wchar * out = out_;
+  wchar * out              = out_;
-  while( inSize-- )
+  while ( inSize-- ) {
  {
    wchar result;
-    if ( *in & 0x80 )
+    if ( *in & 0x80 ) {
-    {
+      if ( *in & 0x40 ) {
-      if ( *in & 0x40 )
+        if ( *in & 0x20 ) {
-      {
+          if ( *in & 0x10 ) {
        if ( *in & 0x20 )
        {
          if ( *in & 0x10 )
          {
            // Four-byte sequence
            if ( *in & 8 )
              // This can't be
@ -67,7 +58,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
            inSize -= 3;
-            result = ( (wchar )*in++ & 7 ) << 18;
+            result = ( (wchar)*in++ & 7 ) << 18;
            if ( ( *in & 0xC0 ) != 0x80 )
              return -1;
@ -81,8 +72,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
              return -1;
            result |= (wchar)*in++ & 0x3F;
          }
-          else
+          else {
          {
            // Three-byte sequence
            if ( inSize < 2 )
@ -90,7 +80,7 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
            inSize -= 2;
-            result = ( (wchar )*in++ & 0xF ) << 12;
+            result = ( (wchar)*in++ & 0xF ) << 12;
            if ( ( *in & 0xC0 ) != 0x80 )
              return -1;
@ -101,23 +91,21 @@ long decode( char const * in_, size_t inSize, wchar * out_ )
            result |= (wchar)*in++ & 0x3F;
          }
        }
-        else
+        else {
        {
          // Two-byte sequence
          if ( !inSize )
            return -1;
          --inSize;
-          result = ( (wchar )*in++ & 0x1F ) << 6;
+          result = ( (wchar)*in++ & 0x1F ) << 6;
          if ( ( *in & 0xC0 ) != 0x80 )
            return -1;
          result |= (wchar)*in++ & 0x3F;
        }
      }
-      else
+      else {
      {
        // This char is from the middle of encoding, it can't be leading
        return -1;
      }
@ -139,18 +127,17 @@ string encode( wstring const & in ) noexcept
  std::vector< char > buffer( in.size() * 4 );
-  return string( &buffer.front(),
+  return string( &buffer.front(), encode( in.data(), in.size(), &buffer.front() ) );
                 encode( in.data(), in.size(), &buffer.front() ) );
 }
-wstring decode( string const & in ) 
+wstring decode( string const & in )
 {
  if ( in.empty() )
    return {};
  std::vector< wchar > buffer( in.size() );
-  long result = decode( in.data(),  in.size(), &buffer.front() );
+  long result = decode( in.data(), in.size(), &buffer.front() );
  if ( result < 0 )
    throw exCantDecode( in );
@ -160,8 +147,7 @@ wstring decode( string const & in )
 bool isspace( int c )
 {
-  switch( c )
+  switch ( c ) {
  {
    case ' ':
    case '\f':
    case '\n':
@ -176,62 +162,95 @@ bool isspace( int c )
 }
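 // Returns the length of the first line of s1 including its terminator s2, i.e. the
 // offset just past the first occurrence of s2. Returns s1length (not -1) if s2 is not found.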
-int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
+int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
 {
-    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
+  char * pos = std::search( s1, s1 + s1length, s2, s2 + s2length );
-    if (pos == s1 + s1length)
+  if ( pos == s1 + s1length )
-        return pos-s1;
+    return pos - s1;
-    //the line size.
+  //the line size.
-    return pos- s1+ s2length;
+  return pos - s1 + s2length;
 }
-char const* getEncodingNameFor(Encoding e)
+char const * getEncodingNameFor( Encoding e )
 {
-    switch (e)
+  switch ( e ) {
-    {
+    case Utf32LE:
      return "UTF-32LE";
    case Utf32BE:
      return "UTF-32BE";
    case Utf16LE:
-        return "UTF-16LE";
+      return "UTF-16LE";
    case Utf16BE:
-        return "UTF-16BE";
+      return "UTF-16BE";
    case Windows1252:
-        return "WINDOWS-1252";
+      return "WINDOWS-1252";
    case Windows1251:
-        return "WINDOWS-1251";
+      return "WINDOWS-1251";
    case Utf8:
-        return "UTF-8";
+      return "UTF-8";
    case Windows1250:
      return "WINDOWS-1250";
    default:
-        return "WINDOWS-1250";
+      return "UTF-8";
-    }
+  }
 }
-LineFeed initLineFeed(Encoding e)
+Encoding getEncodingForName( const QByteArray & _name )
 {
-    LineFeed lf;
+  const auto name = _name.toUpper();
-	switch (e)
+  if ( name == "UTF-32LE" )
-	{
+    return Utf32LE;
-	case Utf8::Utf16LE:
+  if ( name == "UTF-32BE" )
-        lf.lineFeed= new char[2]{ 0x0A,0 };
+    return Utf32BE;
-        lf.length = 2;
+  if ( name == "UTF-16LE" )
-		break;
+    return Utf16LE;
-	case Utf8::Utf16BE:
+  if ( name == "UTF-16BE" )
-        lf.lineFeed = new char[2]{ 0,0x0A };
+    return Utf16BE;
-        lf.length = 2;
+  if ( name == "WINDOWS-1252" )
-		break;
+    return Windows1252;
-	case Utf8::Windows1252:
+  if ( name == "WINDOWS-1251" )
-
+    return Windows1251;
-	case Utf8::Windows1251:
+  if ( name == "UTF-8" )
-
+    return Utf8;
-	case Utf8::Utf8:
+  if ( name == "WINDOWS-1250" )
-
+    return Windows1250;
-	case Utf8::Windows1250:
+  return Utf8;
 	default:
        lf.length = 1;
        lf.lineFeed = new char[1]{ 0x0A };
 	}
    return lf;
 }
 // Builds the byte sequence that represents a line feed (0x0A) in text stored
 // with encoding `e`, together with its length in bytes.
 // The byte buffer is allocated with new[]; this function does not release it.
 // NOTE(review): confirm who owns and frees LineFeed::lineFeed.
 LineFeed initLineFeed( const Encoding e )
 {
   // Size in bytes of one code unit for the given encoding; every encoding
   // that is not UTF-16/UTF-32 is treated as single-byte.
   int unitWidth = 1;
   if ( e == Utf8::Utf32LE || e == Utf8::Utf32BE )
     unitWidth = 4;
   else if ( e == Utf8::Utf16LE || e == Utf8::Utf16BE )
     unitWidth = 2;

   // Big-endian encodings carry the 0x0A in the last byte of the unit,
   // everything else carries it in the first byte.
   const bool bigEndian = ( e == Utf8::Utf32BE || e == Utf8::Utf16BE );

   // Zero-filled buffer with the single significant byte set in place.
   char * bytes                         = new char[ unitWidth ]();
   bytes[ bigEndian ? unitWidth - 1 : 0 ] = 0x0A;

   LineFeed lf{};
   lf.lineFeed = bytes;
   lf.length   = unitWidth;
   return lf;
 }
 } // namespace Utf8
--- a/src/common/utf8.hh
+++ b/src/common/utf8.hh
@ -4,6 +4,7 @@
 #define __UTF8_HH_INCLUDED__
 #include <cstdio>
 #include <QByteArray>
 #include <string>
 #include "ex.hh"
 #include "wstring.hh"
@ -15,14 +16,15 @@
 namespace Utf8 {
 // Those are possible encodings for .dsl files
-enum Encoding
+enum Encoding {
 {
  Utf16LE,
  Utf16BE,
  Windows1252,
  Windows1251,
  Windows1250,
-  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
+  Utf8, // This is an extension. Detected solely by the UTF8 BOM.
  Utf32BE,
  Utf32LE,
 };
 using std::string;
@ -54,7 +56,8 @@ bool isspace( int c );
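 // Returns the length of the first line of s1 including its terminator s2, i.e. the
 // offset just past the first occurrence of s2. Returns s1length (not -1) if s2 is not found.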
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
-char const* getEncodingNameFor(Encoding e);
+char const * getEncodingNameFor( Encoding e );
 Encoding getEncodingForName( const QByteArray & name );
 struct LineFeed
 {
--- a/src/dict/dsl_details.cc
+++ b/src/dict/dsl_details.cc
@ -9,6 +9,7 @@
 #include "ufile.hh"
 #include "utf8.hh"
 #include <exception>
 #include <stdio.h>
 #include <wctype.h>
@ -157,9 +158,9 @@ wstring ArticleDom::Node::renderAsText( bool stripTrsTag ) const
  wstring result;
-  for( list< Node >::const_iterator i = begin(); i != end(); ++i )
+  for ( const auto & i : *this )
-    if( !stripTrsTag || i->tagName !=  U"!trs"  )
+    if ( !stripTrsTag || i.tagName != U"!trs" )
-      result += i->renderAsText( stripTrsTag );
+      result += i.renderAsText( stripTrsTag );
  return result;
 }
@ -248,9 +249,7 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
            processUnsortedParts( linkTo, true );
            expandOptionalParts( linkTo, &allLinkEntries );
-            for( list< wstring >::iterator entry = allLinkEntries.begin();
+            for ( auto entry = allLinkEntries.begin(); entry != allLinkEntries.end(); ) {
                 entry != allLinkEntries.end(); )
            {
              if ( !textNode )
              {
                Node text = Node( Node::Text(), wstring() );
@ -279,8 +278,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
              ArticleDom nodeDom( linkText, dictName, headword_ );
              Node link( Node::Tag(),  U"@" , wstring() );
-              for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
+              for ( auto & n : nodeDom.root )
-                link.push_back( *n );
+                link.push_back( n );
              ++entry;
@ -352,17 +351,19 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
            nextChar();
          }
        }
-        catch( eot )
+        catch ( std::exception & ex ) {
-        {
+          if ( !dictionaryName.empty() )
          if( !dictionaryName.empty() )
            gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found in "%s", article "%s".)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data(),
+                       QString::fromStdU32String( name ).toUtf8().data(),
-                       dictionaryName.c_str(), QString::fromStdU32String( headword ).toUtf8().data() );
+                       QString::fromStdU32String( attrs ).toUtf8().data(),
                       dictionaryName.c_str(),
                       QString::fromStdU32String( headword ).toUtf8().data() );
          else
            gdWarning( R"(DSL: Unfinished tag "%s" with attributes "%s" found)",
-                       QString::fromStdU32String( name ).toUtf8().data(), QString::fromStdU32String( attrs ).toUtf8().data() );
+                       QString::fromStdU32String( name ).toUtf8().data(),
                       QString::fromStdU32String( attrs ).toUtf8().data() );
-          throw eot();
+          throw ex;
        }
        // Add the tag, or close it
@ -491,8 +492,8 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
          ArticleDom nodeDom( linkText, dictName, headword_ );
          Node link( Node::Tag(),  U"ref" , wstring() );
-          for( Node::iterator n = nodeDom.root.begin(); n != nodeDom.root.end(); ++n )
+          for ( auto & n : nodeDom.root )
-            link.push_back( *n );
+            link.push_back( n );
          if ( stack.empty() )
            root.push_back( link );
@ -646,16 +647,14 @@ ArticleDom::ArticleDom( wstring const & str, string const & dictName,
      textNode->text.push_back( ch );
    } // for( ; ; )
  }
-  catch( eot )
+  catch ( eot & ) {
  {
  }
  if ( textNode )
    stack.pop_back();
-  if ( stack.size() )
+  if ( !stack.empty() ) {
-  {
+    auto it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
    list< Node * >::iterator it = std::find_if( stack.begin(), stack.end(), MustTagBeClosed() );
    if( it == stack.end() )
      return; // no unclosed tags that must be closed => nothing to warn about
    QByteArray const firstTagName = QString::fromStdU32String( ( *it )->tagName ).toUtf8();
@ -687,10 +686,8 @@ void ArticleDom::openTag( wstring const & name,
    // All tags above [m] tag will be closed and reopened after
    // to avoid break this tag by closing some other tag.
-    while( stack.size() )
+    while ( !stack.empty() ) {
-    {
+      nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
      nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
                                     stack.back()->tagAttrs ) );
      if ( stack.back()->empty() )
      {
@ -698,7 +695,7 @@ void ArticleDom::openTag( wstring const & name,
        stack.pop_back();
-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;
        parent->pop_back();
      }
@ -724,8 +721,7 @@ void ArticleDom::openTag( wstring const & name,
  // Reopen tags if needed
-  while( nodesToReopen.size() )
+  while ( !nodesToReopen.empty() ) {
  {
    if ( stack.empty() )
    {
      root.push_back( nodesToReopen.back() );
@ -739,7 +735,6 @@ void ArticleDom::openTag( wstring const & name,
    nodesToReopen.pop_back();
  }
 }
 void ArticleDom::closeTag( wstring const & name,
@ -767,14 +762,12 @@ void ArticleDom::closeTag( wstring const & name,
    list< Node > nodesToReopen;
-    while( stack.size() )
+    while ( !stack.empty() ) {
    {
      bool found = stack.back()->tagName == name ||
                   checkM( stack.back()->tagName, name );
      if ( !found )
-        nodesToReopen.push_back( Node( Node::Tag(), stack.back()->tagName,
+        nodesToReopen.emplace_back( Node::Tag(), stack.back()->tagName, stack.back()->tagAttrs );
                                       stack.back()->tagAttrs ) );
      if( stack.back()->empty() && stack.back()->tagName !=  U"br"  )
      {
@ -782,7 +775,7 @@ void ArticleDom::closeTag( wstring const & name,
        stack.pop_back();
-        Node * parent = stack.size() ? stack.back() : &root;
+        Node * parent = !stack.empty() ? stack.back() : &root;
        parent->pop_back();
      }
@ -793,8 +786,7 @@ void ArticleDom::closeTag( wstring const & name,
        break;
    }
-    while( nodesToReopen.size() )
+    while ( !nodesToReopen.empty() ) {
    {
      if ( stack.empty() )
      {
        root.push_back( nodesToReopen.back() );
@ -880,10 +872,9 @@ DslScanner::DslScanner( string const & fileName ) :
  // Now try guessing the encoding by inspecting the first 50 bytes of the file
-  unsigned char firstBytes[ 2 ];
+  unsigned char firstBytes[ 50 ];
-  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) )
+  if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) {
  {
    // Apparently the file's too short
    gzclose( f );
    throw exMalformedDslFile( fileName );
@ -891,53 +882,19 @@ DslScanner::DslScanner( string const & fileName ) :
  bool needExactEncoding = false;
  QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 );
  codec         = QTextCodec::codecForUtfText( ba, QTextCodec::codecForName( "UTF-8" ) );
-  // If the file begins with the dedicated Unicode marker, we just consume
+  encoding = Utf8::getEncodingForName( codec->name() );
-  // it. If, on the other hand, it's not, we return the bytes back
+  qDebug() << codec->name();
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
    encoding = Utf8::Utf16LE;
  else
  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
    encoding = Utf8::Utf16BE;
  else
  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
  {
    // Looks like Utf8, read one more byte
    if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF )
    {
      // Either the file's too short, or the BOM is weird
      gzclose( f );
      throw exMalformedDslFile( fileName );
    }
    encoding = Utf8::Utf8;
  }
  else
  {
    if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
      encoding = Utf8::Utf16LE;
    else
    if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
      encoding = Utf8::Utf16BE;
    else
    {
      // Ok, this doesn't look like 16-bit Unicode. We will start with a
      // 8-bit encoding with an intent to find out the exact one from
      // the header.
      needExactEncoding = true;
      encoding = Utf8::Windows1251;
    }
-    if ( gzrewind( f ) )
+  if ( gzrewind( f ) ) {
-    {
+    gzclose( f );
-      gzclose( f );
+    throw exCantOpen( fileName );
      throw exCantOpen( fileName );
    }
  }
  //iconv.reinit( encoding );
-  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
+  lineFeed = Utf8::initLineFeed( encoding );
  lineFeed=Utf8::initLineFeed(encoding);
  // We now can use our own readNextLine() function
  wstring str;