From f0a3df3d6f5f60475ab1323239542b066762e4b8 Mon Sep 17 00:00:00 2001
From: xiaoyifang <yifang.xiao@outlook.com>
Date: Sat, 6 Nov 2021 16:26:30 +0800
Subject: [PATCH] refactor encoding method

---
 dsl.cc         |   5 +-
 dsl_details.cc | 116 ++++++++++++++---------------------------
 dsl_details.hh |  25 +++------
 gls.cc         | 137 +++++++++++++++++++++++++------------------------
 utf8.cc        |  33 ++++++++++++
 utf8.hh        |  18 ++++++-
 6 files changed, 170 insertions(+), 164 deletions(-)

diff --git a/dsl.cc b/dsl.cc
index 19968748..376c8f5c 100644
--- a/dsl.cc
+++ b/dsl.cc
@@ -75,6 +75,7 @@ using gd::wstring;
 using gd::wchar;
 using std::vector;
 using std::list;
+using Utf8::Encoding;
 
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
@@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address,
       {
         articleData =
           Iconv::toWstring(
-            getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+            Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
             articleBody, articleSize );
         free( articleBody );
 
@@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
     {
       articleData =
         Iconv::toWstring(
-          getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+          Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
           articleBody, articleSize );
       free( articleBody );
 
diff --git a/dsl_details.cc b/dsl_details.cc
index 80e4c861..540377d3 100644
--- a/dsl_details.cc
+++ b/dsl_details.cc
@@ -19,6 +19,7 @@ namespace Details {
 
 using gd::wstring;
 using std::list;
+using Utf8::Encoding;
 
 #ifndef __linux__
 
@@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
 
 #endif
 
-//get the first line in string s1. -1 if not found
-int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
-{
-    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
-
-    if (pos == s1 + s1length)
-        return pos-s1;
-
-    //the line size.
-    return pos- s1+ s2length;
-}
-
 static DSLLangCode LangCodes[] =
 {
   { 1, "en" },
@@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str )
   return reg.indexIn( gd::toQString( str ) ) == 0;
 }
 
-char const* getEncodingNameFor(DslEncoding e)
-{
-    switch (e)
-    {
-    case Utf16LE:
-        return "UTF-16LE";
-    case Utf16BE:
-        return "UTF-16BE";
-    case Windows1252:
-        return "WINDOWS-1252";
-    case Windows1251:
-        return "WINDOWS-1251";
-    case Details::Utf8:
-        return "UTF-8";
-    case Windows1250:
-    default:
-        return "WINDOWS-1250";
-    }
-}
+
 
 /////////////// ArticleDom
 
@@ -811,38 +782,36 @@ void ArticleDom::closeTag( wstring const & name,
 
 void ArticleDom::nextChar() THROW_SPEC( eot )
 {
-  if ( !*stringPos )
-    throw eot();
-  else{
-      ch = *stringPos++;
+    if ( !*stringPos )
+        throw eot();
 
-      if ( ch == L'\\' )
-      {
+    ch = *stringPos++;
+
+    if ( ch == L'\\' )
+    {
         if ( !*stringPos )
-          throw eot();
+            throw eot();
 
         ch = *stringPos++;
 
         escaped = true;
-      }
-      else
-      if ( ch == L'[' && *stringPos == L'[' )
-      {
+    }
+    else if ( ch == L'[' && *stringPos == L'[' )
+    {
         ++stringPos;
         escaped = true;
-      }
-      else
-      if ( ch == L']' && *stringPos == L']' )
-      {
+    }
+    else if ( ch == L']' && *stringPos == L']' )
+    {
         ++stringPos;
         escaped = true;
-      }
-      else
+    }
+    else
         escaped = false;
 
-      if( ch == '\n' || ch == '\r' )
+    if( ch == '\n' || ch == '\r' )
         lineStartPos = stringPos;
-  }
+
 }
 
 bool ArticleDom::atSignFirstInLine()
@@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine()
 /////////////// DslScanner
 
 DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Windows1252 ), readBufferPtr( readBuffer ),
+  encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ),
   readBufferLeft( 0 ), linesRead( 0 )
 {
   // Since .dz is backwards-compatible with .gz, we use gz- functions to
@@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
   // If the file begins with the dedicated Unicode marker, we just consume
   // it. If, on the other hand, it's not, we return the bytes back
   if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
   else
   if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
   else
   if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
   {
@@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
       throw exMalformedDslFile( fileName );
     }
     
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
   }
   else
   {
     if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
-      encoding = Utf16LE;
+      encoding = Utf8::Utf16LE;
     else
     if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
-      encoding = Utf16BE;
+      encoding = Utf8::Utf16BE;
     else
     {
       // Ok, this doesn't look like 16-bit Unicode. We will start with a
       // 8-bit encoding with an intent to find out the exact one from
       // the header.
       needExactEncoding = true;
-      encoding = Windows1251;
+      encoding = Utf8::Windows1251;
     }
 
     if ( gzrewind( f ) )
@@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
       }
       else
       if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
-        encoding = Windows1252;
+        encoding = Utf8::Windows1252;
       else
       if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
-        encoding = Windows1251;
+        encoding = Utf8::Windows1251;
       else
       if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
-        encoding = Windows1250;
+        encoding = Utf8::Windows1250;
       else
       {
         gzclose( f );
@@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
     // Check that we have bytes to read
     if ( readBufferLeft < 5000 )
     {
-      //readBufferPtr+=pos;
-      //readBufferLeft-=pos;
       if ( !gzeof( f ) )
       {
         // To avoid having to deal with ring logic, we move the remaining bytes
@@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
 
         readBufferPtr = readBuffer;
         readBufferLeft += (size_t) result;
-        /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
-        fragStream = new QTextStream(frag) ;
-        fragStream->setCodec(codec);*/
       }
     }
-
-    //if(fragStream->atEnd())
-    //    return false;
-
     if(readBufferLeft<=0)
         return false;
-    //QString line=fragStream->readLine();
-    int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
+
+    int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
     if(pos==-1)
         return false;
     QString line = codec->toUnicode(readBufferPtr, pos);
@@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b
 
 /////////////// DslScanner
 
-void DslScanner::initLineFeed(DslEncoding e)
+void DslScanner::initLineFeed(Utf8::Encoding e)
 {
 	switch (e)
 	{
-	case Utf16LE:
+    case Utf8::Utf16LE:
         lineFeed= new char[2] {0x0A,0};
         lineFeedLength = 2;
         break;
-	case Utf16BE:
+    case Utf8::Utf16BE:
         lineFeed = new char[2] { 0,0x0A};
         lineFeedLength = 2;
         break;
-	case Windows1252:
+    case Utf8::Windows1252:
 		
-	case Windows1251:
+    case Utf8::Windows1251:
 		
-	case Details::Utf8:
+    case Utf8::Utf8:
 		
-	case Windows1250:
+    case Utf8::Windows1250:
 	default:
         lineFeedLength = 1;
         lineFeed = new char[1] {0x0A};
diff --git a/dsl_details.hh b/dsl_details.hh
index 98b6ee5b..5b698a74 100644
--- a/dsl_details.hh
+++ b/dsl_details.hh
@@ -12,6 +12,7 @@
 #include "iconv.hh"
 #include <QTextCodec>
 #include <QByteArray>
+#include "utf8.hh"
 
 // Implementation details for Dsl, not part of its interface
 namespace Dsl {
@@ -22,17 +23,9 @@ using gd::wstring;
 using gd::wchar;
 using std::list;
 using std::vector;
+using Utf8::Encoding;
+
 
-// Those are possible encodings for .dsl files
-enum DslEncoding
-{
-  Utf16LE,
-  Utf16BE,
-  Windows1252,
-  Windows1251,
-  Windows1250,
-  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
-};
 
 struct DSLLangCode
 {
@@ -44,8 +37,6 @@ string findCodeForDslId( int id );
 
 bool isAtSignFirst( wstring const & str );
 
-char const* getEncodingNameFor(DslEncoding e);
-
 /// Parses the DSL language, representing it in its structural DOM form.
 struct ArticleDom
 {
@@ -111,7 +102,7 @@ private:
 class DslScanner
 {
   gzFile f;
-  DslEncoding encoding;
+  Encoding encoding;
   QTextCodec* codec;
   wstring dictionaryName;
   wstring langFrom, langTo;
@@ -138,9 +129,9 @@ public:
   ~DslScanner() throw();
 
   /// Returns the detected encoding of this file.
-  DslEncoding getEncoding() const
+  Encoding getEncoding() const
   { return encoding; }
-  void initLineFeed(DslEncoding e);
+  void initLineFeed(Encoding e);
 
   /// Returns the dictionary's name, as was read from file's headers.
   wstring const & getDictionaryName() const
@@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
 {
   switch( encoding )
   {
-    case Utf16LE:
-    case Utf16BE:
+    case Utf8::Utf16LE:
+    case Utf8::Utf16BE:
       return x*2;
     default:
       return x;
diff --git a/gls.cc b/gls.cc
index 5d76fb75..59ecf360 100644
--- a/gls.cc
+++ b/gls.cc
@@ -58,13 +58,7 @@ using gd::wchar;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
 using BtreeIndexing::IndexInfo;
-
-enum Encoding
-{
-  Utf8,
-  Utf16LE,
-  Utf16BE
-};
+using Utf8::Encoding;
 
 /////////////// GlsScanner
 
@@ -73,15 +67,14 @@ class GlsScanner
   gzFile f;
   Encoding encoding;
   QTextCodec* codec;
-  Iconv iconv;
   wstring dictionaryName;
   wstring dictionaryDecription, dictionaryAuthor;
   wstring langFrom, langTo;
   char readBuffer[ 10000 ];
   char * readBufferPtr;
   size_t readBufferLeft;
-  QTextStream* fragStream;
-  qint64 pos;
+  const char* lineFeed;
+  int lineFeedLength;
   unsigned linesRead;
 
 public:
@@ -126,30 +119,15 @@ public:
   /// Reading begins from the first line after the headers (ones which end
   /// by the "### Glossary section:" line).
   bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
-
+  void initLineFeed(Utf8::Encoding e);
   /// Returns the number of lines read so far from the file.
   unsigned getLinesRead() const
   { return linesRead; }
-
-  /// Returns a name to be passed to iconv for the given encoding.
-  static char const * getEncodingNameFor( Encoding e )
-  {
-    switch( e )
-    {
-      case Utf16LE:
-        return Iconv::Utf16Le;
-      case Utf16BE:
-        return "UTF-16BE";
-      case Utf8:
-      default:
-        return Iconv::Utf8;
-    }
-  }
 };
 
 GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
-  readBufferLeft( 0 ), linesRead( 0 ), pos(0)
+  encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
+  readBufferLeft( 0 ), linesRead( 0 )
 {
   // Since .dz is backwards-compatible with .gz, we use gz- functions to
   // read it -- they are much nicer than the dict_data- ones.
@@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
   // If the file begins with the dedicated Unicode marker, we just consume
   // it. If, on the other hand, it's not, we return the bytes back
   if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
   else
   if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
   else
   if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
   {
@@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
       gzclose( f );
       throw exMalformedGlsFile( fileName );
     }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
   }
   else
   {
@@ -195,12 +173,12 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
       gzclose( f );
       throw exCantOpen( fileName );
     }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
   }
 
-  if( encoding != Utf8 )
-    iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
-  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
+  codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
+  // readNextLine() scans for lineFeed, so it must be set before the first call.
+  initLineFeed(encoding);
   // We now can use our own readNextLine() function
 
   wstring str;
@@ -267,45 +243,74 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
     }
   }
 }
+// Records in lineFeed/lineFeedLength the bytes that encode LF in encoding e.
+// The patterns are string literals (static storage): no allocation, no leak.
+void GlsScanner::initLineFeed(Utf8::Encoding e)
+{
+    switch (e)
+    {
+    case Utf8::Utf16LE:
+        lineFeed = "\x0A\x00"; // U+000A, least significant byte first
+        lineFeedLength = 2;
+        break;
+    case Utf8::Utf16BE:
+        lineFeed = "\x00\x0A"; // U+000A, most significant byte first
+        lineFeedLength = 2;
+        break;
 
+    case Utf8::Windows1252:
+    case Utf8::Windows1251:
+    case Utf8::Windows1250:
+    case Utf8::Utf8:
+    default: // every 8-bit encoding handled here encodes LF as the byte 0x0A
+        lineFeed = "\x0A";
+        lineFeedLength = 1;
+    }
+}
 bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
                                                                        Iconv::Ex )
 {
-	offset = (size_t)(gztell(f) - readBufferLeft + pos);
+    offset = (size_t)(gztell(f) - readBufferLeft);
 
-	{
-		// Check that we have bytes to read
-		if (readBufferLeft - pos < 2000)
-		{
-			readBufferPtr += pos;
-			readBufferLeft -= pos;
-			if (!gzeof(f))
-			{
-				// To avoid having to deal with ring logic, we move the remaining bytes
-				// to the beginning
-				memmove(readBuffer, readBufferPtr, readBufferLeft);
+    {
+      // Check that we have bytes to read
+      if ( readBufferLeft < 5000 )
+      {
+        if ( !gzeof( f ) )
+        {
+          // To avoid having to deal with ring logic, we move the remaining bytes
+          // to the beginning
+          memmove( readBuffer, readBufferPtr, readBufferLeft );
 
-				// Read some more bytes to readBuffer
-				int result = gzread(f, readBuffer + readBufferLeft,
-					sizeof(readBuffer) - readBufferLeft);
+          // Read some more bytes to readBuffer
+          int result = gzread( f, readBuffer + readBufferLeft,
+                               sizeof( readBuffer ) - readBufferLeft );
 
-				if (result == -1)
-                    throw exCantReadGlsFile();
+		  if (result == -1)
+            throw exCantReadGlsFile();
 
-				readBufferPtr = readBuffer;
-				readBufferLeft += (size_t)result;
-				QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
-				fragStream = new QTextStream(frag);
-				fragStream->setCodec(codec);
-			}
-		}
+          readBufferPtr = readBuffer;
+          readBufferLeft += (size_t) result;
+        }
+      }
+      if(readBufferLeft<=0)
+          return false;
 
-		if (fragStream->atEnd())
-			return false;
+      int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
+      if(pos==-1)
+          return false;
+      QString line = codec->toUnicode(readBufferPtr, pos);
+      if(line.endsWith("\n"))
+          line.chop(1);
+      if(line.endsWith("\r"))
+          line.chop(1);
 
-		QString line = fragStream->readLine();
-		pos = fragStream->pos();
-		linesRead++;
+      if(pos>readBufferLeft){
+          pos=readBufferLeft;
+      }
+      readBufferLeft -= pos;
+      readBufferPtr += pos;
+      linesRead++;
 
 #ifdef __WIN32
 		out = line.toStdU32String();
@@ -314,7 +319,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
 #endif
 		return true;
 
-	}
+    }
 }
 
 GlsScanner::~GlsScanner() throw()
@@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address,
   }
   else
   {
-    string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
+    string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
     string::size_type start_pos = 0, end_pos = 0;
 
     for( ; ; )
diff --git a/utf8.cc b/utf8.cc
index c7e516f4..a1370d21 100644
--- a/utf8.cc
+++ b/utf8.cc
@@ -3,6 +3,7 @@
 
 #include "utf8.hh"
 #include <vector>
+#include <algorithm>
 
 namespace Utf8 {
 
@@ -175,4 +176,36 @@ bool isspace( int c )
   }
 }
 
+// Returns the byte length of the first line in s1, terminator s2 included; returns s1length (never -1) when s2 is absent.
+int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
+{
+    // s1 is assumed to start on a code-unit boundary. Comparing only at multiples of
+    // s2length keeps a 2-byte UTF-16 line feed from matching across two code units.
+    for( int i = 0; s2length > 0 && i + s2length <= s1length; i += s2length )
+        if( std::equal( s2, s2 + s2length, s1 + i ) )
+            return i + s2length;
+
+    return s1length; // no terminator: everything that is left is one line
+}
+
+// Maps an Encoding to the codec name handed to Iconv and QTextCodec.
+// Windows1250, and any value that is not a known enumerator, yields
+// "WINDOWS-1250".
+char const* getEncodingNameFor(Encoding e)
+{
+    char const* name = "WINDOWS-1250";
+    if (e == Utf16LE)
+        name = "UTF-16LE";
+    else if (e == Utf16BE)
+        name = "UTF-16BE";
+    else if (e == Windows1252)
+        name = "WINDOWS-1252";
+    else if (e == Windows1251)
+        name = "WINDOWS-1251";
+    else if (e == Utf8)
+        name = "UTF-8";
+
+    return name;
+}
+
 }
diff --git a/utf8.hh b/utf8.hh
index abd86a2f..daf06907 100644
--- a/utf8.hh
+++ b/utf8.hh
@@ -1,6 +1,7 @@
 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
  * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
-
+#ifndef __UTF8_HH_INCLUDED__
+#define __UTF8_HH_INCLUDED__
 #include <cstdio>
 #include <string>
 #include "cpp_features.hh"
@@ -13,6 +14,17 @@
 /// places.
 namespace Utf8 {
 
+// Encodings used by the DSL and GLS code. Index-header values (idxHeader.dslEncoding / glsEncoding) are cast straight to this enum, so keep the order stable.
+enum Encoding
+{
+  Utf16LE,
+  Utf16BE,
+  Windows1252,
+  Windows1251,
+  Windows1250,
+  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
+};
+
 using std::string;
 using gd::wstring;
 using gd::wchar;
@@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode );
 /// Linux but was messing up strings under Windows.
 bool isspace( int c );
 
+// Returns the byte length of the first line in s1, terminator s2 included; returns s1length (never -1) when s2 does not occur.
+int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
+char const* getEncodingNameFor(Encoding e); // name of e as passed to Iconv and QTextCodec::codecForName
 }
+#endif