refactor encoding method

2024-11-24 00:14:06 +00:00 · 2021-11-06 16:26:30 +08:00 · 2021-11-06 16:26:30 +08:00 · f0a3df3d6f
parent 02a88c98ad
commit f0a3df3d6f
6 changed files with 170 additions and 164 deletions
--- a/dsl.cc
+++ b/dsl.cc
@ -75,6 +75,7 @@ using gd::wstring;
 using gd::wchar;
 using std::vector;
 using std::list;
+using Utf8::Encoding;

 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address,
      {
        articleData =
          Iconv::toWstring(
-            getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+            Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
            articleBody, articleSize );
        free( articleBody );

@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
    {
      articleData =
        Iconv::toWstring(
-          getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
+          getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
          articleBody, articleSize );
      free( articleBody );

--- a/dsl_details.cc
+++ b/dsl_details.cc
@ -19,6 +19,7 @@ namespace Details {

 using gd::wstring;
 using std::list;
+using Utf8::Encoding;

 #ifndef __linux__

@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )

 #endif

-//get the first line in string s1. -1 if not found
-int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
-{
-    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
-
-    if (pos == s1 + s1length)
-        return pos-s1;
-
-    //the line size.
-    return pos- s1+ s2length;
-}
-
 static DSLLangCode LangCodes[] =
 {
  { 1, "en" },
@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str )
  return reg.indexIn( gd::toQString( str ) ) == 0;
 }

-char const* getEncodingNameFor(DslEncoding e)
-{
-    switch (e)
-    {
-    case Utf16LE:
-        return "UTF-16LE";
-    case Utf16BE:
-        return "UTF-16BE";
-    case Windows1252:
-        return "WINDOWS-1252";
-    case Windows1251:
-        return "WINDOWS-1251";
-    case Details::Utf8:
-        return "UTF-8";
-    case Windows1250:
-    default:
-        return "WINDOWS-1250";
-    }
-}
+

 /////////////// ArticleDom

@ -811,38 +782,36 @@ void ArticleDom::closeTag( wstring const & name,

 void ArticleDom::nextChar() THROW_SPEC( eot )
 {
-  if ( !*stringPos )
-    throw eot();
-  else{
-      ch = *stringPos++;
+    if ( !*stringPos )
+        throw eot();

-      if ( ch == L'\\' )
-      {
+    ch = *stringPos++;
+
+    if ( ch == L'\\' )
+    {
        if ( !*stringPos )
-          throw eot();
+            throw eot();

        ch = *stringPos++;

        escaped = true;
-      }
-      else
-      if ( ch == L'[' && *stringPos == L'[' )
-      {
+    }
+    else if ( ch == L'[' && *stringPos == L'[' )
+    {
        ++stringPos;
        escaped = true;
-      }
-      else
-      if ( ch == L']' && *stringPos == L']' )
-      {
+    }
+    else if ( ch == L']' && *stringPos == L']' )
+    {
        ++stringPos;
        escaped = true;
-      }
-      else
+    }
+    else
        escaped = false;

-      if( ch == '\n' || ch == '\r' )
+    if( ch == '\n' || ch == '\r' )
        lineStartPos = stringPos;
-  }
+
 }

 bool ArticleDom::atSignFirstInLine()
@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine()
 /////////////// DslScanner

 DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Windows1252 ), readBufferPtr( readBuffer ),
+  encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ),
  readBufferLeft( 0 ), linesRead( 0 )
 {
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
  // If the file begins with the dedicated Unicode marker, we just consume
  // it. If, on the other hand, it's not, we return the bytes back
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
  else
  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
  else
  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
  {
@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      throw exMalformedDslFile( fileName );
    }
    
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }
  else
  {
    if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
-      encoding = Utf16LE;
+      encoding = Utf8::Utf16LE;
    else
    if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
-      encoding = Utf16BE;
+      encoding = Utf8::Utf16BE;
    else
    {
      // Ok, this doesn't look like 16-bit Unicode. We will start with a
      // 8-bit encoding with an intent to find out the exact one from
      // the header.
      needExactEncoding = true;
-      encoding = Windows1251;
+      encoding = Utf8::Windows1251;
    }

    if ( gzrewind( f ) )
@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      }
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
-        encoding = Windows1252;
+        encoding = Utf8::Windows1252;
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
-        encoding = Windows1251;
+        encoding = Utf8::Windows1251;
      else
      if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
-        encoding = Windows1250;
+        encoding = Utf8::Windows1250;
      else
      {
        gzclose( f );
@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
    // Check that we have bytes to read
    if ( readBufferLeft < 5000 )
    {
-      //readBufferPtr+=pos;
-      //readBufferLeft-=pos;
      if ( !gzeof( f ) )
      {
        // To avoid having to deal with ring logic, we move the remaining bytes
@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo

        readBufferPtr = readBuffer;
        readBufferLeft += (size_t) result;
-        /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
-        fragStream = new QTextStream(frag) ;
-        fragStream->setCodec(codec);*/
      }
    }
-
-    //if(fragStream->atEnd())
-    //    return false;
-
    if(readBufferLeft<=0)
        return false;
-    //QString line=fragStream->readLine();
-    int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
+
+    int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
    if(pos==-1)
        return false;
    QString line = codec->toUnicode(readBufferPtr, pos);
@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b

 /////////////// DslScanner

-void DslScanner::initLineFeed(DslEncoding e)
+void DslScanner::initLineFeed(Utf8::Encoding e)
 {
 	switch (e)
 	{
-	case Utf16LE:
+    case Utf8::Utf16LE:
        lineFeed= new char[2] {0x0A,0};
        lineFeedLength = 2;
        break;
-	case Utf16BE:
+    case Utf8::Utf16BE:
        lineFeed = new char[2] { 0,0x0A};
        lineFeedLength = 2;
        break;
-	case Windows1252:
+    case Utf8::Windows1252:
 		
-	case Windows1251:
+    case Utf8::Windows1251:
 		
-	case Details::Utf8:
+    case Utf8::Utf8:
 		
-	case Windows1250:
+    case Utf8::Windows1250:
 	default:
        lineFeedLength = 1;
        lineFeed = new char[1] {0x0A};
--- a/dsl_details.hh
+++ b/dsl_details.hh
@ -12,6 +12,7 @@
 #include "iconv.hh"
 #include <QTextCodec>
 #include <QByteArray>
+#include "utf8.hh"

 // Implementation details for Dsl, not part of its interface
 namespace Dsl {
@ -22,17 +23,9 @@ using gd::wstring;
 using gd::wchar;
 using std::list;
 using std::vector;
+using Utf8::Encoding;
+

-// Those are possible encodings for .dsl files
-enum DslEncoding
-{
-  Utf16LE,
-  Utf16BE,
-  Windows1252,
-  Windows1251,
-  Windows1250,
-  Utf8 // This is an extension. Detected solely by the UTF8 BOM.
-};

 struct DSLLangCode
 {
@ -44,8 +37,6 @@ string findCodeForDslId( int id );

 bool isAtSignFirst( wstring const & str );

-char const* getEncodingNameFor(DslEncoding e);
-
 /// Parses the DSL language, representing it in its structural DOM form.
 struct ArticleDom
 {
@ -111,7 +102,7 @@ private:
 class DslScanner
 {
  gzFile f;
-  DslEncoding encoding;
+  Encoding encoding;
  QTextCodec* codec;
  wstring dictionaryName;
  wstring langFrom, langTo;
@ -138,9 +129,9 @@ public:
  ~DslScanner() throw();

  /// Returns the detected encoding of this file.
-  DslEncoding getEncoding() const
+  Encoding getEncoding() const
  { return encoding; }
-  void initLineFeed(DslEncoding e);
+  void initLineFeed(Encoding e);

  /// Returns the dictionary's name, as was read from file's headers.
  wstring const & getDictionaryName() const
@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
 {
  switch( encoding )
  {
-    case Utf16LE:
-    case Utf16BE:
+    case Utf8::Utf16LE:
+    case Utf8::Utf16BE:
      return x*2;
    default:
      return x;
--- a/gls.cc
+++ b/gls.cc
@ -58,13 +58,7 @@ using gd::wchar;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
 using BtreeIndexing::IndexInfo;
-
-enum Encoding
-{
-  Utf8,
-  Utf16LE,
-  Utf16BE
-};
+using Utf8::Encoding;

 /////////////// GlsScanner

@ -73,15 +67,14 @@ class GlsScanner
  gzFile f;
  Encoding encoding;
  QTextCodec* codec;
-  Iconv iconv;
  wstring dictionaryName;
  wstring dictionaryDecription, dictionaryAuthor;
  wstring langFrom, langTo;
  char readBuffer[ 10000 ];
  char * readBufferPtr;
  size_t readBufferLeft;
-  QTextStream* fragStream;
-  qint64 pos;
+  const char* lineFeed;
+  int lineFeedLength;
  unsigned linesRead;

 public:
@ -126,30 +119,15 @@ public:
  /// Reading begins from the first line after the headers (ones which end
  /// by the "### Glossary section:" line).
  bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
-
+  void initLineFeed(Utf8::Encoding e);
  /// Returns the number of lines read so far from the file.
  unsigned getLinesRead() const
  { return linesRead; }
-
-  /// Returns a name to be passed to iconv for the given encoding.
-  static char const * getEncodingNameFor( Encoding e )
-  {
-    switch( e )
-    {
-      case Utf16LE:
-        return Iconv::Utf16Le;
-      case Utf16BE:
-        return "UTF-16BE";
-      case Utf8:
-      default:
-        return Iconv::Utf8;
-    }
-  }
 };

 GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
-  encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
-  readBufferLeft( 0 ), linesRead( 0 ), pos(0)
+  encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
+  readBufferLeft( 0 ), linesRead( 0 )
 {
  // Since .dz is backwards-compatible with .gz, we use gz- functions to
  // read it -- they are much nicer than the dict_data- ones.
@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
  // If the file begins with the dedicated Unicode marker, we just consume
  // it. If, on the other hand, it's not, we return the bytes back
  if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
-    encoding = Utf16LE;
+    encoding = Utf8::Utf16LE;
  else
  if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
-    encoding = Utf16BE;
+    encoding = Utf8::Utf16BE;
  else
  if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
  {
@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      gzclose( f );
      throw exMalformedGlsFile( fileName );
    }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }
  else
  {
@ -195,12 +173,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
      gzclose( f );
      throw exCantOpen( fileName );
    }
-    encoding = Utf8;
+    encoding = Utf8::Utf8;
  }

-  if( encoding != Utf8 )
-    iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
-  codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
+  codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
  // We now can use our own readNextLine() function

  wstring str;
@ -267,45 +243,74 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
    }
  }
 }
+void GlsScanner::initLineFeed(Utf8::Encoding e) // Sets lineFeed and lineFeedLength for encoding e.
+{
+    switch (e)
+    {
+    case Utf8::Utf16LE:
+        lineFeed= new char[2] {0x0A,0}; // LF as two bytes, low byte first
+        lineFeedLength = 2;
+        break;
+    case Utf8::Utf16BE:
+        lineFeed = new char[2] { 0,0x0A}; // LF as two bytes, high byte first
+        lineFeedLength = 2;
+        break;
+    case Utf8::Windows1252:

+    case Utf8::Windows1251:
+        // The labels above and below intentionally share one body with `default`.
+    case Utf8::Utf8:
+        // Every encoding other than the two UTF-16 variants uses a single LF byte.
+    case Utf8::Windows1250:
+    default:
+        lineFeedLength = 1;
+        lineFeed = new char[1] {0x0A}; // NOTE(review): new[] buffer; no matching delete[] is visible in this diff - confirm ownership
+    }
+}
 bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
                                                                       Iconv::Ex )
 {
-	offset = (size_t)(gztell(f) - readBufferLeft + pos);
+    offset = (size_t)(gztell(f) - readBufferLeft);

-	{
-		// Check that we have bytes to read
-		if (readBufferLeft - pos < 2000)
-		{
-			readBufferPtr += pos;
-			readBufferLeft -= pos;
-			if (!gzeof(f))
-			{
-				// To avoid having to deal with ring logic, we move the remaining bytes
-				// to the beginning
-				memmove(readBuffer, readBufferPtr, readBufferLeft);
+    {
+      // Check that we have bytes to read
+      if ( readBufferLeft < 5000 )
+      {
+        if ( !gzeof( f ) )
+        {
+          // To avoid having to deal with ring logic, we move the remaining bytes
+          // to the beginning
+          memmove( readBuffer, readBufferPtr, readBufferLeft );

-				// Read some more bytes to readBuffer
-				int result = gzread(f, readBuffer + readBufferLeft,
-					sizeof(readBuffer) - readBufferLeft);
+          // Read some more bytes to readBuffer
+          int result = gzread( f, readBuffer + readBufferLeft,
+                               sizeof( readBuffer ) - readBufferLeft );

-				if (result == -1)
-                    throw exCantReadGlsFile();
+		  if (result == -1)
+            throw exCantReadGlsFile();

-				readBufferPtr = readBuffer;
-				readBufferLeft += (size_t)result;
-				QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
-				fragStream = new QTextStream(frag);
-				fragStream->setCodec(codec);
-			}
-		}
+          readBufferPtr = readBuffer;
+          readBufferLeft += (size_t) result;
+        }
+      }
+      if(readBufferLeft<=0)
+          return false;

-		if (fragStream->atEnd())
-			return false;
+      int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
+      if(pos==-1)
+          return false;
+      QString line = codec->toUnicode(readBufferPtr, pos);
+      if(line.endsWith("\n"))
+          line.chop(1);
+      if(line.endsWith("\r"))
+          line.chop(1);

-		QString line = fragStream->readLine();
-		pos = fragStream->pos();
-		linesRead++;
+      if(pos>readBufferLeft){
+          pos=readBufferLeft;
+      }
+      readBufferLeft -= pos;
+      readBufferPtr += pos;
+      linesRead++;

 #ifdef __WIN32
 		out = line.toStdU32String();
@ -314,7 +319,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
 #endif
 		return true;

-	}
+    }
 }

 GlsScanner::~GlsScanner() throw()
@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address,
  }
  else
  {
-    string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
+    string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
    string::size_type start_pos = 0, end_pos = 0;

    for( ; ; )
--- a/utf8.cc
+++ b/utf8.cc
@ -3,6 +3,7 @@

 #include "utf8.hh"
 #include <vector>
+#include <algorithm>

 namespace Utf8 {

@ -175,4 +176,36 @@ bool isspace( int c )
  }
 }

+// Returns the byte length of the first line in s1, including the terminator s2; returns s1length (never -1) when s2 is absent.
+int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
+{
+    char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
+    // NOTE(review): the search above is byte-wise, so a 2-byte terminator may match at an odd offset - confirm for UTF-16 input.
+    if (pos == s1 + s1length) // terminator not found: the whole buffer counts as the line
+        return pos-s1;
+
+    // Offset just past the terminator, i.e. the line length including s2.
+    return pos- s1+ s2length;
+}
+// Returns the encoding name string for e; callers in this diff pass it to Iconv and QTextCodec::codecForName.
+char const* getEncodingNameFor(Encoding e)
+{
+    switch (e)
+    {
+    case Utf16LE:
+        return "UTF-16LE";
+    case Utf16BE:
+        return "UTF-16BE";
+    case Windows1252:
+        return "WINDOWS-1252";
+    case Windows1251:
+        return "WINDOWS-1251";
+    case Utf8:
+        return "UTF-8";
+    case Windows1250:
+    default: // any unrecognized value also maps to WINDOWS-1250
+        return "WINDOWS-1250";
+    }
+}
+
 }
--- a/utf8.hh
+++ b/utf8.hh
@ -1,6 +1,7 @@
 /* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
-
+#ifndef __UTF8_HH_INCLUDED__
+#define __UTF8_HH_INCLUDED__
 #include <cstdio>
 #include <string>
 #include "cpp_features.hh"
@ -13,6 +14,17 @@
 /// places.
 namespace Utf8 {

+// Text encodings shared by the DSL and GLS scanners touched in this change (dsl_details.cc, gls.cc).
+enum Encoding
+{
+  Utf16LE,
+  Utf16BE,
+  Windows1252,
+  Windows1251,
+  Windows1250,
+  Utf8 // For DSL this is an extension selected by the UTF-8 BOM; the GLS scanner also defaults to it when no BOM is found.
+};
+
 using std::string;
 using gd::wstring;
 using gd::wchar;
@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode );
 /// Linux but was messing up strings under Windows.
 bool isspace( int c );

+// Returns the byte length of the first line in s1 including the terminator s2, or s1length if s2 is not found (never -1).
+int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
+char const* getEncodingNameFor(Encoding e);
 }
+#endif