refactor: merge some encoding and encoding names related code

2024-11-30 17:24:08 +00:00 · 2024-11-24 01:29:09 -05:00 · 2024-11-24 01:29:09 -05:00 · dda91a30dd
parent 52a9427b8b
commit dda91a30dd
11 changed files with 113 additions and 111 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -26,6 +26,7 @@ Checks: >
  -google-readability-casting,
  -hicpp-deprecated-headers,
  -hicpp-no-array-decay,
  -misc-confusable-identifiers,
  -misc-const-correctness,
  -misc-include-cleaner,
  -misc-non-private-member-variables-in-classes,
@ -33,6 +34,7 @@ Checks: >
  -modernize-deprecated-headers,
  -modernize-use-nodiscard,
  -modernize-use-trailing-return-type,
  -performance-enum-size,
  -readability-function-cognitive-complexity,
  -readability-identifier-length,
  -readability-magic-numbers,
--- a/src/common/iconv.cc
+++ b/src/common/iconv.cc
@ -6,12 +6,8 @@
 #include <errno.h>
 #include <string.h>
 char const * const Iconv::GdWchar = "UTF-32LE";
 char const * const Iconv::Utf16Le = "UTF-16LE";
 char const * const Iconv::Utf8    = "UTF-8";
 Iconv::Iconv( char const * from ):
-  state( iconv_open( Utf8, from ) )
+  state( iconv_open( Text::utf8, from ) )
 {
  if ( state == (iconv_t)-1 ) {
    throw exCantInit( strerror( errno ) );
--- a/src/common/iconv.hh
+++ b/src/common/iconv.hh
@ -3,14 +3,11 @@
 #pragma once
 #include <QString>
 #include "text.hh"
 #include "ex.hh"
-
+#include "text.hh"
 #include <QString>
 #include <iconv.h>
 /// "Internationalization conversion" for char encoding conversion, currently implemented with iconv()
 /// Only supports converting from a known "from" to UTF8
 class Iconv
@ -22,12 +19,6 @@ public:
  DEF_EX( Ex, "Iconv exception", std::exception )
  DEF_EX_STR( exCantInit, "Can't initialize iconv conversion:", Ex )
  // Some predefined character sets' names
  static char const * const GdWchar;
  static char const * const Utf16Le;
  static char const * const Utf8;
  explicit Iconv( char const * from );
  ~Iconv();
--- a/src/common/text.cc
+++ b/src/common/text.cc
@ -10,6 +10,60 @@
 namespace Text {
 /// Returns the encoding name string that corresponds to `e`.
 /// Encoding::Utf8, and any value not tested below, yields utf8.
 const char * getEncodingNameFor( Encoding e )
 {
  if ( e == Encoding::Utf32LE ) {
    return utf32_le;
  }
  if ( e == Encoding::Utf32BE ) {
    return utf32_be;
  }
  if ( e == Encoding::Utf32 ) {
    return utf32;
  }
  if ( e == Encoding::Utf16LE ) {
    return utf16_le;
  }
  if ( e == Encoding::Utf16BE ) {
    return utf16_be;
  }
  if ( e == Encoding::Windows1252 ) {
    return windows_1252;
  }
  if ( e == Encoding::Windows1251 ) {
    return windows_1251;
  }
  if ( e == Encoding::Windows1250 ) {
    return windows_1250;
  }
  // Encoding::Utf8 and every unrecognised value end up here.
  return utf8;
 }
 /// Returns the Encoding whose name string equals `name` after `name` has been
 /// upper-cased with QByteArray::toUpper(). Anything that matches none of the
 /// listed names (including "UTF-8" itself) yields Encoding::Utf8.
 Encoding getEncodingForName( const QByteArray & name )
 {
  struct NamedEncoding
  {
    const char * label;
    Encoding value;
  };

  // Checked top to bottom; the first label that matches wins.
  static constexpr NamedEncoding known[] = {
    { utf32_le, Encoding::Utf32LE },
    { utf32_be, Encoding::Utf32BE },
    { utf32, Encoding::Utf32 },
    { utf16_le, Encoding::Utf16LE },
    { utf16_be, Encoding::Utf16BE },
    { windows_1252, Encoding::Windows1252 },
    { windows_1251, Encoding::Windows1251 },
    { windows_1250, Encoding::Windows1250 },
  };

  auto const upper = name.toUpper();
  for ( auto const & entry : known ) {
    if ( upper == entry.label ) {
      return entry.value;
    }
  }
  return Encoding::Utf8;
 }
 /// Encodes the given UTF-32 into UTF-8. The inSize specifies the number
 /// of wide characters the 'in' pointer points to. The 'out' buffer must be
@ -200,87 +254,31 @@ int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2lengt
  return pos - s1 + s2length;
 }
 /// Returns the encoding name literal for `e`. Utf8 and any value without its
 /// own case both yield "UTF-8".
 /// NOTE(review): a getEncodingNameFor( Encoding ) built on the Text:: name
 /// constants appears earlier in this patch to text.cc; this literal-based copy
 /// looks like the removed side of the hunk (-200,87 +254,31) -- confirm.
 char const * getEncodingNameFor( Encoding e )
 {
  switch ( e ) {
    case Utf32LE:
      return "UTF-32LE";
    case Utf32BE:
      return "UTF-32BE";
    case Utf16LE:
      return "UTF-16LE";
    case Utf16BE:
      return "UTF-16BE";
    case Windows1252:
      return "WINDOWS-1252";
    case Windows1251:
      return "WINDOWS-1251";
    case Utf8:
      return "UTF-8";
    case Windows1250:
      return "WINDOWS-1250";
    // Same result as the explicit Utf8 case above.
    default:
      return "UTF-8";
  }
 }
 /// Returns the Encoding whose name literal equals `_name` after upper-casing it
 /// with QByteArray::toUpper(); falls back to Utf8 when nothing matches.
 /// NOTE(review): there is no "UTF-32" comparison here, unlike the constant-based
 /// getEncodingForName earlier in this patch; this copy looks like the removed
 /// side of the hunk (-200,87 +254,31) -- confirm.
 Encoding getEncodingForName( const QByteArray & _name )
 {
  // Upper-case once so the comparisons below can use upper-case literals.
  const auto name = _name.toUpper();
  if ( name == "UTF-32LE" ) {
    return Utf32LE;
  }
  if ( name == "UTF-32BE" ) {
    return Utf32BE;
  }
  if ( name == "UTF-16LE" ) {
    return Utf16LE;
  }
  if ( name == "UTF-16BE" ) {
    return Utf16BE;
  }
  if ( name == "WINDOWS-1252" ) {
    return Windows1252;
  }
  if ( name == "WINDOWS-1251" ) {
    return Windows1251;
  }
  if ( name == "UTF-8" ) {
    return Utf8;
  }
  if ( name == "WINDOWS-1250" ) {
    return Windows1250;
  }
  // Unrecognised names are treated as UTF-8.
  return Utf8;
 }
 LineFeed initLineFeed( const Encoding e )
 {
  LineFeed lf{};
  switch ( e ) {
-    case Utf32LE:
+    case Encoding::Utf32LE:
      lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
      lf.length   = 4;
      break;
-    case Utf32BE:
+    case Encoding::Utf32BE:
      lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
      lf.length   = 4;
      break;
-    case Utf16LE:
+    case Encoding::Utf16LE:
      lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
      lf.length   = 2;
      break;
-    case Utf16BE:
+    case Encoding::Utf16BE:
      lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
      lf.length   = 2;
      break;
-    case Windows1252:
+    case Encoding::Windows1252:
-
+    case Encoding::Windows1251:
-    case Windows1251:
+    case Encoding::Windows1250:
-
+    case Encoding::Utf8:
    case Utf8:
    case Windows1250:
    default:
      lf.length   = 1;
      lf.lineFeed = new char[ 1 ]{ 0x0A };
--- a/src/common/text.hh
+++ b/src/common/text.hh
@ -2,18 +2,18 @@
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
 #pragma once
-#include <cstdio>
+#include "ex.hh"
 #include <QByteArray>
 #include <string>
 #include "ex.hh"
 /// Facilities to process Text, focusing on Unicode
 namespace Text {
 DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )
-// Those are possible encodings for .dsl files
+/// Encoding names. Ref -> IANA's encoding names https://www.iana.org/assignments/character-sets/character-sets.xhtml
-enum Encoding {
+/// Notice: the order of the enumerators that come before Utf32LE must not be changed, because the current .dsl index file format depends on it.
-  Utf16LE,
+enum class Encoding {
  Utf16LE = 0,
  Utf16BE,
  Windows1252,
  Windows1251,
@ -21,9 +21,25 @@ enum Encoding {
  Utf8,
  Utf32BE,
  Utf32LE,
  Utf32,
 };
 /// Encoding name strings. These are the values getEncodingNameFor() returns and
 /// getEncodingForName() compares against; Iconv passes Text::utf8 to iconv_open().
 // Unicode encodings
 inline constexpr const char * utf8         = "UTF-8";
 inline constexpr const char * utf16_le     = "UTF-16LE";
 inline constexpr const char * utf16_be     = "UTF-16BE";
 inline constexpr const char * utf32        = "UTF-32";
 inline constexpr const char * utf32_le     = "UTF-32LE";
 inline constexpr const char * utf32_be     = "UTF-32BE";
 // Windows code pages
 inline constexpr const char * windows_1250 = "WINDOWS-1250";
 inline constexpr const char * windows_1251 = "WINDOWS-1251";
 inline constexpr const char * windows_1252 = "WINDOWS-1252";
 const char * getEncodingNameFor( Encoding e );
 Encoding getEncodingForName( const QByteArray & name );
 /// utf32 -> utf8
 std::string toUtf8( std::u32string const & ) noexcept;
 /// utf8 -> utf32
 std::u32string toUtf32( std::string const & );
 /// Since the standard isspace() is locale-specific, we need something
@ -33,8 +49,6 @@ bool isspace( int c );
 //get the first line in string s1. -1 if not found
 int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
 char const * getEncodingNameFor( Encoding e );
 Encoding getEncodingForName( const QByteArray & name );
 struct LineFeed
 {
--- a/src/dict/dsl.cc
+++ b/src/dict/dsl.cc
@ -1144,8 +1144,9 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
  }
  else {
    try {
-      articleData =
+      articleData = Iconv::toWstring( getEncodingNameFor( static_cast< Encoding >( idxHeader.dslEncoding ) ),
-        Iconv::toWstring( getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize );
+                                      articleBody,
                                      articleSize );
      free( articleBody );
      // Strip DSL comments
@ -1789,7 +1790,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
            idx.write( soundDictName.data(), soundDictName.size() );
          }
-          idxHeader.dslEncoding = scanner.getEncoding();
+          idxHeader.dslEncoding = static_cast< uint32_t >( scanner.getEncoding() );
          IndexedWords indexedWords;
--- a/src/dict/dsl_details.cc
+++ b/src/dict/dsl_details.cc
@ -844,7 +844,7 @@ bool ArticleDom::atSignFirstInLine()
 /////////////// DslScanner
 DslScanner::DslScanner( string const & fileName ):
-  encoding( Text::Utf8 ),
+  encoding( Text::Encoding::Utf8 ),
  readBufferPtr( readBuffer ),
  readBufferLeft( 0 ),
  linesRead( 0 )
@ -875,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ):
       guessedEncoding.has_value() ) {
    switch ( guessedEncoding.value() ) {
      case QStringConverter::Utf8:
-        encoding = Text::Utf8;
+        encoding = Text::Encoding::Utf8;
        break;
      case QStringConverter::Utf16LE:
-        encoding = Text::Utf16LE;
+        encoding = Text::Encoding::Utf16LE;
        break;
      case QStringConverter::Utf16BE:
-        encoding = Text::Utf16BE;
+        encoding = Text::Encoding::Utf16BE;
        break;
      case QStringConverter::Utf32LE:
-        encoding = Text::Utf16LE;
+        // A detected UTF-32LE file maps to Utf32LE (as the Utf32BE case below maps to Utf32BE), not Utf16LE.
+        encoding = Text::Encoding::Utf32LE;
        break;
      case QStringConverter::Utf32BE:
-        encoding = Text::Utf32BE;
+        encoding = Text::Encoding::Utf32BE;
        break;
      default:
        break;
@ -976,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ):
        qWarning( "Warning: encoding was specified in a Unicode file, ignoring." );
      }
      else if ( !arg.compare( U"Latin" ) ) {
-        encoding = Text::Windows1252;
+        encoding = Text::Encoding::Windows1252;
      }
      else if ( !arg.compare( U"Cyrillic" ) ) {
-        encoding = Text::Windows1251;
+        encoding = Text::Encoding::Windows1251;
      }
      else if ( !arg.compare( U"EasternEuropean" ) ) {
-        encoding = Text::Windows1250;
+        encoding = Text::Encoding::Windows1250;
      }
      else {
        gzclose( f );
--- a/src/dict/dsl_details.hh
+++ b/src/dict/dsl_details.hh
@ -207,8 +207,8 @@ void stripComments( std::u32string &, bool & );
 inline size_t DslScanner::distanceToBytes( size_t x ) const
 {
  switch ( encoding ) {
-    case Text::Utf16LE:
+    case Encoding::Utf16LE:
-    case Text::Utf16BE:
+    case Encoding::Utf16BE:
      return x * 2;
    default:
      return x;
--- a/src/dict/gls.cc
+++ b/src/dict/gls.cc
@ -123,7 +123,7 @@ public:
 };
 GlsScanner::GlsScanner( string const & fileName ):
-  encoding( Text::Utf8 ),
+  encoding( Encoding::Utf8 ),
  readBufferPtr( readBuffer ),
  readBufferLeft( 0 ),
  linesRead( 0 )
@ -149,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ):
  // If the file begins with the dedicated Unicode marker, we just consume
  // it. If, on the other hand, it's not, we return the bytes back
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) {
-    encoding = Text::Utf16LE;
+    encoding = Encoding::Utf16LE;
  }
  else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) {
-    encoding = Text::Utf16BE;
+    encoding = Encoding::Utf16BE;
  }
  else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) {
    // Looks like Utf8, read one more byte
@ -161,14 +161,14 @@ GlsScanner::GlsScanner( string const & fileName ):
      gzclose( f );
      throw exMalformedGlsFile( fileName );
    }
-    encoding = Text::Utf8;
+    encoding = Encoding::Utf8;
  }
  else {
    if ( gzrewind( f ) ) {
      gzclose( f );
      throw exCantOpen( fileName );
    }
-    encoding = Text::Utf8;
+    encoding = Encoding::Utf8;
  }
  codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) );
@ -1259,7 +1259,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
          idx.write( (uint32_t)dictionaryName.size() );
          idx.write( dictionaryName.data(), dictionaryName.size() );
-          idxHeader.glsEncoding = scanner.getEncoding();
+          idxHeader.glsEncoding = static_cast< uint32_t >( scanner.getEncoding() );
          IndexedWords indexedWords;
--- a/src/dict/hunspell.cc
+++ b/src/dict/hunspell.cc
@ -207,7 +207,7 @@ void HunspellArticleRequest::run()
    QMutexLocker _( &hunspellMutex );
-    string trimmedWord_utf8 = Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() );
+    // Keep the explicit byte order of the replaced Iconv::GdWchar ("UTF-32LE") so the refactor does not change the source encoding.
+    string trimmedWord_utf8 = Iconv::toUtf8( Text::utf32_le, trimmedWord.data(), trimmedWord.size() );
    if ( hunspell.spell( trimmedWord_utf8 ) ) {
      // Good word -- no spelling suggestions then.
@ -361,7 +361,7 @@ QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex,
  try {
    QMutexLocker _( &hunspellMutex );
-    auto suggestions = hunspell.analyze( Iconv::toUtf8( Iconv::GdWchar, word.data(), word.size() ) );
+    // Keep the explicit byte order of the replaced Iconv::GdWchar ("UTF-32LE") so the refactor does not change the source encoding.
+    auto suggestions = hunspell.analyze( Iconv::toUtf8( Text::utf32_le, word.data(), word.size() ) );
    if ( !suggestions.empty() ) {
      // There were some suggestions made for us. Make an appropriate output.
@ -464,7 +464,7 @@ void HunspellPrefixMatchRequest::run()
    QMutexLocker _( &hunspellMutex );
-    if ( hunspell.spell( Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() ) ) ) {
+    // Keep the explicit byte order of the replaced Iconv::GdWchar ("UTF-32LE") so the refactor does not change the source encoding.
+    if ( hunspell.spell( Iconv::toUtf8( Text::utf32_le, trimmedWord.data(), trimmedWord.size() ) ) ) {
      // Known word -- add it to the result
      QMutexLocker _( &dataMutex );
--- a/src/dict/lsa.cc
+++ b/src/dict/lsa.cc
@ -143,7 +143,7 @@ Entry::Entry( File::Index & f )
  // Read the size of the recording, in samples
  samplesLength = f.read< uint32_t >();
-  name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
+  name = Iconv::toUtf8( Text::utf16_le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
 }
 class LsaDictionary: public BtreeIndexing::BtreeDictionary