refactor encoding method

2024-11-27 19:24:08 +00:00 · 2021-11-06 16:26:30 +08:00 · 2021-11-06 16:26:30 +08:00 · f0a3df3d6f
parent 02a88c98ad
commit f0a3df3d6f
6 changed files with 170 additions and 164 deletions
--- a/dsl.cc
+++ b/dsl.cc
@ -75,6 +75,7 @@ using gd::wstring;
 using gd::wchar;
 using std::vector;
 using std::list;
 using Utf8::Encoding;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address,
      {
        articleData =
          Iconv::toWstring(
-            getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+            Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
            articleBody, articleSize );
        free( articleBody );
@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
    {
      articleData =
        Iconv::toWstring(
-          getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+          getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
          articleBody, articleSize );
      free( articleBody );
--- a/dsl_details.cc
+++ b/dsl_details.cc
@ -19,6 +19,7 @@ namespace Details {
 using gd::wstring;
 using std::list;
 using Utf8::Encoding;
 #ifndef __linux__
@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
 #endif
 /// Locates the end of the first line stored in the byte buffer s1.
 ///
 /// s1, s1length - buffer to scan and its size in bytes. The buffer is assumed
 ///                to start on a code-unit boundary of the text encoding.
 /// s2, s2length - the encoded line terminator and its size in bytes
 ///                (for example {0x0A} or {0x0A,0x00}).
 ///
 /// Returns the number of bytes occupied by the first line, terminator
 /// included. When the buffer holds no terminator the whole buffer counts as
 /// one line and s1length is returned; this function never returns -1.
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
 {
     if (s1length <= 0)
         return 0;

     // An empty terminator would "match" at offset 0 and produce a zero-length
     // line, so the caller would never advance. Treat it as "no terminator".
     if (s2length <= 0)
         return s1length;

     char* const end = s1 + s1length;
     char* from = s1;
     for (;;)
     {
         char* hit = std::search(from, end, s2, s2 + s2length);
         if (hit == end)
             return s1length;

         // A multi-byte terminator only counts when it sits on a code-unit
         // boundary. Otherwise the bytes belong to two neighbouring characters
         // (e.g. UTF-16LE 05 0A | 00 01 contains 0A 00 at an odd offset).
         if ((hit - s1) % s2length == 0)
             return static_cast<int>(hit - s1) + s2length;

         from = hit + 1;
     }
 }
 static DSLLangCode LangCodes[] =
 {
  { 1, "en" },
@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str )
  return reg.indexIn( gd::toQString( str ) ) == 0;
 }
-char const* getEncodingNameFor(DslEncoding e)
+
 {
    switch (e)
    {
    case Utf16LE:
        return "UTF-16LE";
    case Utf16BE:
        return "UTF-16BE";
    case Windows1252:
        return "WINDOWS-1252";
    case Windows1251:
        return "WINDOWS-1251";
    case Details::Utf8:
        return "UTF-8";
    case Windows1250:
    default:
        return "WINDOWS-1250";
    }
 }
 /////////////// ArticleDom
@ -811,38 +782,36 @@ void ArticleDom::closeTag( wstring const & name,
 void ArticleDom::nextChar() THROW_SPEC( eot )
 {
-  if ( !*stringPos )
+    if ( !*stringPos )
-    throw eot();
+        throw eot();
  else{
      ch = *stringPos++;
-      if ( ch == L'\\' )
+    ch = *stringPos++;
-      {
+
    if ( ch == L'\\' )
    {
        if ( !*stringPos )
-          throw eot();
+            throw eot();
        ch = *stringPos++;
        escaped = true;
-      }
+    }
-      else
+    else if ( ch == L'[' && *stringPos == L'[' )
-      if ( ch == L'[' && *stringPos == L'[' )
+    {
      {
        ++stringPos;
        escaped = true;
-      }
+    }
-      else
+    else if ( ch == L']' && *stringPos == L']' )
-      if ( ch == L']' && *stringPos == L']' )
+    {
      {
        ++stringPos;
        escaped = true;
-      }
+    }
-      else
+    else
        escaped = false;
-      if( ch == '\n' || ch == '\r' )
+    if( ch == '\n' || ch == '\r' )
        lineStartPos = stringPos;
-  }
+
 }
 bool ArticleDom::atSignFirstInLine()
@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine()
 /////////////// DslScanner
 DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Windows1252 ), readBufferPtr( readBuffer ),
+  encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ),
  readBufferLeft( 0 ), linesRead( 0 )
 {
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
  // If the file begins with the dedicated Unicode marker, we just consume
  // it. If, on the other hand, it's not, we return the bytes back
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
  else
  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
  else
  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
  {
@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      throw exMalformedDslFile( fileName );
    }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }
  else
  {
    if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
-      encoding = Utf16LE;
+      encoding = Utf8::Utf16LE;
    else
    if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
-      encoding = Utf16BE;
+      encoding = Utf8::Utf16BE;
    else
    {
      // Ok, this doesn't look like 16-bit Unicode. We will start with a
      // 8-bit encoding with an intent to find out the exact one from
      // the header.
      needExactEncoding = true;
-      encoding = Windows1251;
+      encoding = Utf8::Windows1251;
    }
    if ( gzrewind( f ) )
@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      }
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
-        encoding = Windows1252;
+        encoding = Utf8::Windows1252;
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
-        encoding = Windows1251;
+        encoding = Utf8::Windows1251;
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
-        encoding = Windows1250;
+        encoding = Utf8::Windows1250;
      else
      {
        gzclose( f );
@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
    // Check that we have bytes to read
    if ( readBufferLeft < 5000 )
    {
      //readBufferPtr+=pos;
      //readBufferLeft-=pos;
      if ( !gzeof( f ) )
      {
        // To avoid having to deal with ring logic, we move the remaining bytes
@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
        readBufferPtr = readBuffer;
        readBufferLeft += (size_t) result;
        /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
        fragStream = new QTextStream(frag) ;
        fragStream->setCodec(codec);*/
      }
    }
    //if(fragStream->atEnd())
    //    return false;
    if(readBufferLeft<=0)
        return false;
-    //QString line=fragStream->readLine();
+
-    int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
+    int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
    if(pos==-1)
        return false;
    QString line = codec->toUnicode(readBufferPtr, pos);
@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b
 /////////////// DslScanner
-void DslScanner::initLineFeed(DslEncoding e)
+void DslScanner::initLineFeed(Utf8::Encoding e)
 {
 	switch (e)
 	{
-	case Utf16LE:
+    case Utf8::Utf16LE:
        lineFeed= new char[2] {0x0A,0};
        lineFeedLength = 2;
        break;
-	case Utf16BE:
+    case Utf8::Utf16BE:
        lineFeed = new char[2] { 0,0x0A};
        lineFeedLength = 2;
        break;
-	case Windows1252:
+    case Utf8::Windows1252:
-	case Windows1251:
+    case Utf8::Windows1251:
-	case Details::Utf8:
+    case Utf8::Utf8:
-	case Windows1250:
+    case Utf8::Windows1250:
 	default:
        lineFeedLength = 1;
        lineFeed = new char[1] {0x0A};
--- a/dsl_details.hh
+++ b/dsl_details.hh
@ -12,6 +12,7 @@
 #include "iconv.hh"
 #include <QTextCodec>
 #include <QByteArray>
 #include "utf8.hh"
 // Implementation details for Dsl, not part of its interface
 namespace Dsl {
@ -22,17 +23,9 @@ using gd::wstring;
 using gd::wchar;
 using std::list;
 using std::vector;
 using Utf8::Encoding;
 // Possible encodings for .dsl files.
 // NOTE(review): idxHeader.dslEncoding is cast back to this enum when article
 // data is loaded, so the numeric values presumably end up stored in index
 // files -- confirm before reordering or inserting enumerators.
 enum DslEncoding
 {
  Utf16LE,
  Utf16BE,
  Windows1252,
  Windows1251,
  Windows1250,
  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
 };
 struct DSLLangCode
 {
@ -44,8 +37,6 @@ string findCodeForDslId( int id );
 bool isAtSignFirst( wstring const & str );
 char const* getEncodingNameFor(DslEncoding e);
 /// Parses the DSL language, representing it in its structural DOM form.
 struct ArticleDom
 {
@ -111,7 +102,7 @@ private:
 class DslScanner
 {
  gzFile f;
-  DslEncoding encoding;
+  Encoding encoding;
  QTextCodec* codec;
  wstring dictionaryName;
  wstring langFrom, langTo;
@ -138,9 +129,9 @@ public:
  ~DslScanner() throw();
  /// Returns the detected encoding of this file.
-  DslEncoding getEncoding() const
+  Encoding getEncoding() const
  { return encoding; }
-  void initLineFeed(DslEncoding e);
+  void initLineFeed(Encoding e);
  /// Returns the dictionary's name, as was read from file's headers.
  wstring const & getDictionaryName() const
@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
 {
  switch( encoding )
  {
-    case Utf16LE:
+    case Utf8::Utf16LE:
-    case Utf16BE:
+    case Utf8::Utf16BE:
      return x*2;
    default:
      return x;
--- a/gls.cc
+++ b/gls.cc
@ -58,13 +58,7 @@ using gd::wchar;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
 using BtreeIndexing::IndexInfo;
-
+using Utf8::Encoding;
 // Encodings recognised for .gls files.
 // NOTE(review): idxHeader.glsEncoding is cast to Encoding when article text is
 // loaded, so these numeric values (Utf8 == 0 here) presumably end up stored in
 // index files -- confirm before reordering or replacing this enum.
 enum Encoding
 {
  Utf8,
  Utf16LE,
  Utf16BE
 };
 /////////////// GlsScanner
@ -73,15 +67,14 @@ class GlsScanner
  gzFile f;
  Encoding encoding;
  QTextCodec* codec;
  Iconv iconv;
  wstring dictionaryName;
  wstring dictionaryDecription, dictionaryAuthor;
  wstring langFrom, langTo;
  char readBuffer[ 10000 ];
  char * readBufferPtr;
  size_t readBufferLeft;
-  QTextStream* fragStream;
+  const char* lineFeed;
-  qint64 pos;
+  int lineFeedLength;
  unsigned linesRead;
 public:
@ -126,30 +119,15 @@ public:
  /// Reading begins from the first line after the headers (ones which end
  /// by the "### Glossary section:" line).
  bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
-
+  void initLineFeed(Utf8::Encoding e);
  /// Returns the number of lines read so far from the file.
  unsigned getLinesRead() const
  { return linesRead; }
  /// Translates an Encoding value into the encoding name to be passed to iconv.
  /// Anything other than the two UTF-16 variants is reported as UTF-8.
  static char const * getEncodingNameFor( Encoding e )
  {
    if ( e == Utf16LE )
      return Iconv::Utf16Le;

    if ( e == Utf16BE )
      return "UTF-16BE";

    // Utf8 and every unrecognised value
    return Iconv::Utf8;
  }
 };
 GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
+  encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
-  readBufferLeft( 0 ), linesRead( 0 ), pos(0)
+  readBufferLeft( 0 ), linesRead( 0 )
 {
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
  // read it -- they are much nicer than the dict_data- ones.
@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
  // If the file begins with the dedicated Unicode marker, we just consume
  // it. If, on the other hand, it's not, we return the bytes back
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
  else
  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
  else
  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
  {
@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      gzclose( f );
      throw exMalformedGlsFile( fileName );
    }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }
  else
  {
@ -195,12 +173,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      gzclose( f );
      throw exCantOpen( fileName );
    }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }
-  if( encoding != Utf8 )
+  codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
    iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
  // We now can use our own readNextLine() function
  wstring str;
@ -267,45 +243,74 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
    }
  }
 }
 /// Selects the byte sequence that marks a line feed in encoding e and stores
 /// it, together with its length in bytes, in lineFeed / lineFeedLength.
 /// NOTE(review): the buffer is allocated with new[] on every call; where it is
 /// released is not visible from this function -- confirm.
 void GlsScanner::initLineFeed(Utf8::Encoding e)
 {
     if (e == Utf8::Utf16LE)
     {
         // UTF-16 little endian: low byte (0x0A) first.
         lineFeed = new char[2] {0x0A, 0};
         lineFeedLength = 2;
     }
     else if (e == Utf8::Utf16BE)
     {
         // UTF-16 big endian: high byte (0x00) first.
         lineFeed = new char[2] {0, 0x0A};
         lineFeedLength = 2;
     }
     else
     {
         // Windows-125x, UTF-8 and any other value: a single 0x0A byte.
         lineFeedLength = 1;
         lineFeed = new char[1] {0x0A};
     }
 }
 bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
                                                                       Iconv::Ex )
 {
-	offset = (size_t)(gztell(f) - readBufferLeft + pos);
+    offset = (size_t)(gztell(f) - readBufferLeft);
-	{
+    {
-		// Check that we have bytes to read
+      // Check that we have bytes to read
-		if (readBufferLeft - pos < 2000)
+      if ( readBufferLeft < 5000 )
-		{
+      {
-			readBufferPtr += pos;
+        if ( !gzeof( f ) )
-			readBufferLeft -= pos;
+        {
-			if (!gzeof(f))
+          // To avoid having to deal with ring logic, we move the remaining bytes
-			{
+          // to the beginning
-				// To avoid having to deal with ring logic, we move the remaining bytes
+          memmove( readBuffer, readBufferPtr, readBufferLeft );
 				// to the beginning
 				memmove(readBuffer, readBufferPtr, readBufferLeft);
-				// Read some more bytes to readBuffer
+          // Read some more bytes to readBuffer
-				int result = gzread(f, readBuffer + readBufferLeft,
+          int result = gzread( f, readBuffer + readBufferLeft,
-					sizeof(readBuffer) - readBufferLeft);
+                               sizeof( readBuffer ) - readBufferLeft );
-				if (result == -1)
+		  if (result == -1)
-                    throw exCantReadGlsFile();
+            throw exCantReadGlsFile();
-				readBufferPtr = readBuffer;
+          readBufferPtr = readBuffer;
-				readBufferLeft += (size_t)result;
+          readBufferLeft += (size_t) result;
-				QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
+        }
-				fragStream = new QTextStream(frag);
+      }
-				fragStream->setCodec(codec);
+      if(readBufferLeft<=0)
-			}
+          return false;
 		}
-		if (fragStream->atEnd())
+      int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
-			return false;
+      if(pos==-1)
          return false;
      QString line = codec->toUnicode(readBufferPtr, pos);
      if(line.endsWith("\n"))
          line.chop(1);
      if(line.endsWith("\r"))
          line.chop(1);
-		QString line = fragStream->readLine();
+      if(pos>readBufferLeft){
-		pos = fragStream->pos();
+          pos=readBufferLeft;
-		linesRead++;
+      }
      readBufferLeft -= pos;
      readBufferPtr += pos;
      linesRead++;
 #ifdef __WIN32
 		out = line.toStdU32String();
@ -314,7 +319,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
 #endif
 		return true;
-	}
+    }
 }
 GlsScanner::~GlsScanner() throw()
@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address,
  }
  else
  {
-    string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
+    string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
    string::size_type start_pos = 0, end_pos = 0;
    for( ; ; )
--- a/utf8.cc
+++ b/utf8.cc
@ -3,6 +3,7 @@
 #include "utf8.hh"
 #include <vector>
 #include <algorithm>
 namespace Utf8 {
@ -175,4 +176,36 @@ bool isspace( int c )
  }
 }
 /// Locates the end of the first line stored in the byte buffer s1.
 ///
 /// s1, s1length - buffer to scan and its size in bytes. The buffer is assumed
 ///                to start on a code-unit boundary of the text encoding.
 /// s2, s2length - the encoded line terminator and its size in bytes
 ///                (for example {0x0A} or {0x0A,0x00}).
 ///
 /// Returns the number of bytes occupied by the first line, terminator
 /// included. When the buffer holds no terminator the whole buffer counts as
 /// one line and s1length is returned; this function never returns -1.
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
 {
     if (s1length <= 0)
         return 0;

     // An empty terminator would "match" at offset 0 and produce a zero-length
     // line, so the caller would never advance. Treat it as "no terminator".
     if (s2length <= 0)
         return s1length;

     char* const end = s1 + s1length;
     char* from = s1;
     for (;;)
     {
         char* hit = std::search(from, end, s2, s2 + s2length);
         if (hit == end)
             return s1length;

         // A multi-byte terminator only counts when it sits on a code-unit
         // boundary. Otherwise the bytes belong to two neighbouring characters
         // (e.g. UTF-16LE 05 0A | 00 01 contains 0A 00 at an odd offset).
         if ((hit - s1) % s2length == 0)
             return static_cast<int>(hit - s1) + s2length;

         from = hit + 1;
     }
 }
 /// Maps an Encoding value to the corresponding encoding name string.
 /// Windows1250 and every value not listed below yield "WINDOWS-1250".
 char const* getEncodingNameFor(Encoding e)
 {
     if (e == Utf16LE)
         return "UTF-16LE";

     if (e == Utf16BE)
         return "UTF-16BE";

     if (e == Windows1252)
         return "WINDOWS-1252";

     if (e == Windows1251)
         return "WINDOWS-1251";

     if (e == Utf8)
         return "UTF-8";

     // Windows1250 and the fallback for anything unrecognised
     return "WINDOWS-1250";
 }
 }
--- a/utf8.hh
+++ b/utf8.hh
@ -1,6 +1,7 @@
 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
-
+#ifndef __UTF8_HH_INCLUDED__
 #define __UTF8_HH_INCLUDED__
 #include <cstdio>
 #include <string>
 #include "cpp_features.hh"
@ -13,6 +14,17 @@
 /// places.
 namespace Utf8 {
 // Text encodings understood by the dictionary scanners (originally the list
 // of possible encodings for .dsl files).
 // NOTE(review): idxHeader.dslEncoding and idxHeader.glsEncoding are cast to
 // this enum when articles are loaded, so the numeric values presumably end up
 // stored in index files -- confirm before reordering or inserting enumerators.
 enum Encoding
 {
  Utf16LE,
  Utf16BE,
  Windows1252,
  Windows1251,
  Windows1250,
  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
 };
 using std::string;
 using gd::wstring;
 using gd::wchar;
@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode );
 /// Linux but was messing up strings under Windows.
 bool isspace( int c );
 //Returns the byte length of the first line in s1, including the terminator s2; returns s1length (never -1) when s2 does not occur.
 int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
 char const* getEncodingNameFor(Encoding e);
 }
 #endif