From f0a3df3d6f5f60475ab1323239542b066762e4b8 Mon Sep 17 00:00:00 2001 From: xiaoyifang Date: Sat, 6 Nov 2021 16:26:30 +0800 Subject: [PATCH] refract encoding method --- dsl.cc | 5 +- dsl_details.cc | 116 ++++++++++++++--------------------------- dsl_details.hh | 25 +++------ gls.cc | 137 +++++++++++++++++++++++++------------------------ utf8.cc | 33 ++++++++++++ utf8.hh | 18 ++++++- 6 files changed, 170 insertions(+), 164 deletions(-) diff --git a/dsl.cc b/dsl.cc index 19968748..376c8f5c 100644 --- a/dsl.cc +++ b/dsl.cc @@ -75,6 +75,7 @@ using gd::wstring; using gd::wchar; using std::vector; using std::list; +using Utf8::Encoding; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; @@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address, { articleData = Iconv::toWstring( - getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ), + Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); free( articleBody ); @@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword, { articleData = Iconv::toWstring( - getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ), + getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize ); free( articleBody ); diff --git a/dsl_details.cc b/dsl_details.cc index 80e4c861..540377d3 100644 --- a/dsl_details.cc +++ b/dsl_details.cc @@ -19,6 +19,7 @@ namespace Details { using gd::wstring; using std::list; +using Utf8::Encoding; #ifndef __linux__ @@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 ) #endif -//get the first line in string s1. -1 if not found -int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length) -{ - char* pos = std::search(s1,s1+s1length, s2, s2+s2length); - - if (pos == s1 + s1length) - return pos-s1; - - //the line size. - return pos- s1+ s2length; -} - static DSLLangCode LangCodes[] = { { 1, "en" }, @@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str ) return reg.indexIn( gd::toQString( str ) ) == 0; } -char const* getEncodingNameFor(DslEncoding e) -{ - switch (e) - { - case Utf16LE: - return "UTF-16LE"; - case Utf16BE: - return "UTF-16BE"; - case Windows1252: - return "WINDOWS-1252"; - case Windows1251: - return "WINDOWS-1251"; - case Details::Utf8: - return "UTF-8"; - case Windows1250: - default: - return "WINDOWS-1250"; - } -} + /////////////// ArticleDom @@ -811,38 +782,36 @@ void ArticleDom::closeTag( wstring const & name, void ArticleDom::nextChar() THROW_SPEC( eot ) { - if ( !*stringPos ) - throw eot(); - else{ - ch = *stringPos++; + if ( !*stringPos ) + throw eot(); - if ( ch == L'\\' ) - { + ch = *stringPos++; + + if ( ch == L'\\' ) + { if ( !*stringPos ) - throw eot(); + throw eot(); ch = *stringPos++; escaped = true; - } - else - if ( ch == L'[' && *stringPos == L'[' ) - { + } + else if ( ch == L'[' && *stringPos == L'[' ) + { ++stringPos; escaped = true; - } - else - if ( ch == L']' && *stringPos == L']' ) - { + } + else if ( ch == L']' && *stringPos == L']' ) + { ++stringPos; escaped = true; - } - else + } + else escaped = false; - if( ch == '\n' || ch == '\r' ) + if( ch == '\n' || ch == '\r' ) lineStartPos = stringPos; - } + } bool ArticleDom::atSignFirstInLine() @@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine() /////////////// DslScanner DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): - encoding( Windows1252 ), readBufferPtr( readBuffer ), + encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ), readBufferLeft( 0 ), linesRead( 0 ) { // Since .dz is backwards-compatible with .gz, we use gz- functions to @@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): // If the file begins with the dedicated Unicode marker, we just consume // it. If, on the other hand, it's not, we return the bytes back if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) - encoding = Utf16LE; + encoding = Utf8::Utf16LE; else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) - encoding = Utf16BE; + encoding = Utf8::Utf16BE; else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) { @@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): throw exMalformedDslFile( fileName ); } - encoding = Utf8; + encoding = Utf8::Utf8; } else { if ( firstBytes[ 0 ] && !firstBytes[ 1 ] ) - encoding = Utf16LE; + encoding = Utf8::Utf16LE; else if ( !firstBytes[ 0 ] && firstBytes[ 1 ] ) - encoding = Utf16BE; + encoding = Utf8::Utf16BE; else { // Ok, this doesn't look like 16-bit Unicode. We will start with a // 8-bit encoding with an intent to find out the exact one from // the header. needExactEncoding = true; - encoding = Windows1251; + encoding = Utf8::Windows1251; } if ( gzrewind( f ) ) @@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): } else if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) ) - encoding = Windows1252; + encoding = Utf8::Windows1252; else if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) ) - encoding = Windows1251; + encoding = Utf8::Windows1251; else if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) ) - encoding = Windows1250; + encoding = Utf8::Windows1250; else { gzclose( f ); @@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo // Check that we have bytes to read if ( readBufferLeft < 5000 ) { - //readBufferPtr+=pos; - //readBufferLeft-=pos; if ( !gzeof( f ) ) { // To avoid having to deal with ring logic, we move the remaining bytes @@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo readBufferPtr = readBuffer; readBufferLeft += (size_t) result; - /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft); - fragStream = new QTextStream(frag) ; - fragStream->setCodec(codec);*/ } } - - //if(fragStream->atEnd()) - // return false; - if(readBufferLeft<=0) return false; - //QString line=fragStream->readLine(); - int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength); + + int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength); if(pos==-1) return false; QString line = codec->toUnicode(readBufferPtr, pos); @@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b /////////////// DslScanner -void DslScanner::initLineFeed(DslEncoding e) +void DslScanner::initLineFeed(Utf8::Encoding e) { switch (e) { - case Utf16LE: + case Utf8::Utf16LE: lineFeed= new char[2] {0x0A,0}; lineFeedLength = 2; break; - case Utf16BE: + case Utf8::Utf16BE: lineFeed = new char[2] { 0,0x0A}; lineFeedLength = 2; break; - case Windows1252: + case Utf8::Windows1252: - case Windows1251: + case Utf8::Windows1251: - case Details::Utf8: + case Utf8::Utf8: - case Windows1250: + case Utf8::Windows1250: default: lineFeedLength = 1; lineFeed = new char[1] {0x0A}; diff --git a/dsl_details.hh b/dsl_details.hh index 98b6ee5b..5b698a74 100644 --- a/dsl_details.hh +++ b/dsl_details.hh @@ -12,6 +12,7 @@ #include "iconv.hh" #include #include +#include "utf8.hh" // Implementation details for Dsl, not part of its interface namespace Dsl { @@ -22,17 +23,9 @@ using gd::wstring; using gd::wchar; using std::list; using std::vector; +using Utf8::Encoding; + -// Those are possible encodings for .dsl files -enum DslEncoding -{ - Utf16LE, - Utf16BE, - Windows1252, - Windows1251, - Windows1250, - Utf8 // This is an extension. Detected solely by the UTF8 BOM. -}; struct DSLLangCode { @@ -44,8 +37,6 @@ string findCodeForDslId( int id ); bool isAtSignFirst( wstring const & str ); -char const* getEncodingNameFor(DslEncoding e); - /// Parses the DSL language, representing it in its structural DOM form. struct ArticleDom { @@ -111,7 +102,7 @@ private: class DslScanner { gzFile f; - DslEncoding encoding; + Encoding encoding; QTextCodec* codec; wstring dictionaryName; wstring langFrom, langTo; @@ -138,9 +129,9 @@ public: ~DslScanner() throw(); /// Returns the detected encoding of this file. - DslEncoding getEncoding() const + Encoding getEncoding() const { return encoding; } - void initLineFeed(DslEncoding e); + void initLineFeed(Encoding e); /// Returns the dictionary's name, as was read from file's headers. wstring const & getDictionaryName() const @@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const { switch( encoding ) { - case Utf16LE: - case Utf16BE: + case Utf8::Utf16LE: + case Utf8::Utf16BE: return x*2; default: return x; diff --git a/gls.cc b/gls.cc index 5d76fb75..59ecf360 100644 --- a/gls.cc +++ b/gls.cc @@ -58,13 +58,7 @@ using gd::wchar; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; - -enum Encoding -{ - Utf8, - Utf16LE, - Utf16BE -}; +using Utf8::Encoding; /////////////// GlsScanner @@ -73,15 +67,14 @@ class GlsScanner gzFile f; Encoding encoding; QTextCodec* codec; - Iconv iconv; wstring dictionaryName; wstring dictionaryDecription, dictionaryAuthor; wstring langFrom, langTo; char readBuffer[ 10000 ]; char * readBufferPtr; size_t readBufferLeft; - QTextStream* fragStream; - qint64 pos; + const char* lineFeed; + int lineFeedLength; unsigned linesRead; public: @@ -126,30 +119,15 @@ public: /// Reading begins from the first line after the headers (ones which end /// by the "### Glossary section:" line). bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex ); - + void initLineFeed(Utf8::Encoding e); /// Returns the number of lines read so far from the file. unsigned getLinesRead() const { return linesRead; } - - /// Returns a name to be passed to iconv for the given encoding. - static char const * getEncodingNameFor( Encoding e ) - { - switch( e ) - { - case Utf16LE: - return Iconv::Utf16Le; - case Utf16BE: - return "UTF-16BE"; - case Utf8: - default: - return Iconv::Utf8; - } - } }; GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): - encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ), - readBufferLeft( 0 ), linesRead( 0 ), pos(0) + encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ), + readBufferLeft( 0 ), linesRead( 0 ) { // Since .dz is backwards-compatible with .gz, we use gz- functions to // read it -- they are much nicer than the dict_data- ones. @@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): // If the file begins with the dedicated Unicode marker, we just consume // it. If, on the other hand, it's not, we return the bytes back if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) - encoding = Utf16LE; + encoding = Utf8::Utf16LE; else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) - encoding = Utf16BE; + encoding = Utf8::Utf16BE; else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) { @@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): gzclose( f ); throw exMalformedGlsFile( fileName ); } - encoding = Utf8; + encoding = Utf8::Utf8; } else { @@ -195,12 +173,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): gzclose( f ); throw exCantOpen( fileName ); } - encoding = Utf8; + encoding = Utf8::Utf8; } - if( encoding != Utf8 ) - iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) ); - codec = QTextCodec::codecForName(getEncodingNameFor(encoding)); + codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding)); // We now can use our own readNextLine() function wstring str; @@ -267,45 +243,74 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): } } } +void GlsScanner::initLineFeed(Utf8::Encoding e) +{ + switch (e) + { + case Utf8::Utf16LE: + lineFeed= new char[2] {0x0A,0}; + lineFeedLength = 2; + break; + case Utf8::Utf16BE: + lineFeed = new char[2] { 0,0x0A}; + lineFeedLength = 2; + break; + case Utf8::Windows1252: + case Utf8::Windows1251: + + case Utf8::Utf8: + + case Utf8::Windows1250: + default: + lineFeedLength = 1; + lineFeed = new char[1] {0x0A}; + } +} bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex ) { - offset = (size_t)(gztell(f) - readBufferLeft + pos); + offset = (size_t)(gztell(f) - readBufferLeft); - { - // Check that we have bytes to read - if (readBufferLeft - pos < 2000) - { - readBufferPtr += pos; - readBufferLeft -= pos; - if (!gzeof(f)) - { - // To avoid having to deal with ring logic, we move the remaining bytes - // to the beginning - memmove(readBuffer, readBufferPtr, readBufferLeft); + { + // Check that we have bytes to read + if ( readBufferLeft < 5000 ) + { + if ( !gzeof( f ) ) + { + // To avoid having to deal with ring logic, we move the remaining bytes + // to the beginning + memmove( readBuffer, readBufferPtr, readBufferLeft ); - // Read some more bytes to readBuffer - int result = gzread(f, readBuffer + readBufferLeft, - sizeof(readBuffer) - readBufferLeft); + // Read some more bytes to readBuffer + int result = gzread( f, readBuffer + readBufferLeft, + sizeof( readBuffer ) - readBufferLeft ); - if (result == -1) - throw exCantReadGlsFile(); + if (result == -1) + throw exCantReadGlsFile(); - readBufferPtr = readBuffer; - readBufferLeft += (size_t)result; - QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft); - fragStream = new QTextStream(frag); - fragStream->setCodec(codec); - } - } + readBufferPtr = readBuffer; + readBufferLeft += (size_t) result; + } + } + if(readBufferLeft<=0) + return false; - if (fragStream->atEnd()) - return false; + int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength); + if(pos==-1) + return false; + QString line = codec->toUnicode(readBufferPtr, pos); + if(line.endsWith("\n")) + line.chop(1); + if(line.endsWith("\r")) + line.chop(1); - QString line = fragStream->readLine(); - pos = fragStream->pos(); - linesRead++; + if(pos>readBufferLeft){ + pos=readBufferLeft; + } + readBufferLeft -= pos; + readBufferPtr += pos; + linesRead++; #ifdef __WIN32 out = line.toStdU32String(); @@ -314,7 +319,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex, #endif return true; - } + } } GlsScanner::~GlsScanner() throw() @@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address, } else { - string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize ); + string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize ); string::size_type start_pos = 0, end_pos = 0; for( ; ; ) diff --git a/utf8.cc b/utf8.cc index c7e516f4..a1370d21 100644 --- a/utf8.cc +++ b/utf8.cc @@ -3,6 +3,7 @@ #include "utf8.hh" #include +#include namespace Utf8 { @@ -175,4 +176,36 @@ bool isspace( int c ) } } +//get the first line in string s1. -1 if not found +int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length) +{ + char* pos = std::search(s1,s1+s1length, s2, s2+s2length); + + if (pos == s1 + s1length) + return pos-s1; + + //the line size. + return pos- s1+ s2length; +} + +char const* getEncodingNameFor(Encoding e) +{ + switch (e) + { + case Utf16LE: + return "UTF-16LE"; + case Utf16BE: + return "UTF-16BE"; + case Windows1252: + return "WINDOWS-1252"; + case Windows1251: + return "WINDOWS-1251"; + case Utf8: + return "UTF-8"; + case Windows1250: + default: + return "WINDOWS-1250"; + } +} + } diff --git a/utf8.hh b/utf8.hh index abd86a2f..daf06907 100644 --- a/utf8.hh +++ b/utf8.hh @@ -1,6 +1,7 @@ /* This file is (c) 2008-2012 Konstantin Isakov * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */ - +#ifndef __UTF8_HH_INCLUDED__ +#define __UTF8_HH_INCLUDED__ #include #include #include "cpp_features.hh" @@ -13,6 +14,17 @@ /// places. namespace Utf8 { +// Those are possible encodings for .dsl files +enum Encoding +{ + Utf16LE, + Utf16BE, + Windows1252, + Windows1251, + Windows1250, + Utf8 // This is an extension. Detected solely by the UTF8 BOM. +}; + using std::string; using gd::wstring; using gd::wchar; @@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode ); /// Linux but was messing up strings under Windows. bool isspace( int c ); +//get the first line in string s1. -1 if not found +int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length); +char const* getEncodingNameFor(Encoding e); } +#endif