refactor encoding method

This commit is contained in:
xiaoyifang 2021-11-06 16:26:30 +08:00
parent 02a88c98ad
commit f0a3df3d6f
6 changed files with 170 additions and 164 deletions

5
dsl.cc
View file

@ -75,6 +75,7 @@ using gd::wstring;
using gd::wchar;
using std::vector;
using std::list;
using Utf8::Encoding;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address,
{
articleData =
Iconv::toWstring(
getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
articleBody, articleSize );
free( articleBody );
@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
{
articleData =
Iconv::toWstring(
getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
articleBody, articleSize );
free( articleBody );

View file

@ -19,6 +19,7 @@ namespace Details {
using gd::wstring;
using std::list;
using Utf8::Encoding;
#ifndef __linux__
@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
#endif
// Measures the first line held in s1 (s1length bytes), terminator included.
// s2 / s2length describe the line-terminator byte sequence to look for.
// When the terminator is absent the result is s1length, i.e. the whole
// buffer is treated as one line; -1 is never produced.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
{
  char * const bufferEnd = s1 + s1length;
  char * const terminator = std::search( s1, bufferEnd, s2, s2 + s2length );
  const bool found = ( terminator != bufferEnd );

  // Bytes before the terminator, plus the terminator itself when present.
  return static_cast< int >( terminator - s1 ) + ( found ? s2length : 0 );
}
static DSLLangCode LangCodes[] =
{
{ 1, "en" },
@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str )
return reg.indexIn( gd::toQString( str ) ) == 0;
}
// Translates a DSL encoding value into the matching character-set name.
// Windows1250 and any value not listed below both yield "WINDOWS-1250".
char const* getEncodingNameFor(DslEncoding e)
{
  if ( e == Utf16LE )
    return "UTF-16LE";
  if ( e == Utf16BE )
    return "UTF-16BE";
  if ( e == Windows1252 )
    return "WINDOWS-1252";
  if ( e == Windows1251 )
    return "WINDOWS-1251";
  // Qualified so the enumerator is not confused with the Utf8 namespace.
  if ( e == Details::Utf8 )
    return "UTF-8";

  return "WINDOWS-1250";
}
/////////////// ArticleDom
@ -813,7 +784,7 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
{
if ( !*stringPos )
throw eot();
else{
ch = *stringPos++;
if ( ch == L'\\' )
@ -825,14 +796,12 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
escaped = true;
}
else
if ( ch == L'[' && *stringPos == L'[' )
else if ( ch == L'[' && *stringPos == L'[' )
{
++stringPos;
escaped = true;
}
else
if ( ch == L']' && *stringPos == L']' )
else if ( ch == L']' && *stringPos == L']' )
{
++stringPos;
escaped = true;
@ -842,7 +811,7 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
if( ch == '\n' || ch == '\r' )
lineStartPos = stringPos;
}
}
bool ArticleDom::atSignFirstInLine()
@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine()
/////////////// DslScanner
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
encoding( Windows1252 ), readBufferPtr( readBuffer ),
encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), linesRead( 0 )
{
// Since .dz is backwards-compatible with .gz, we use gz- functions to
@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
// If the file begins with the dedicated Unicode marker, we just consume
// it. If, on the other hand, it's not, we return the bytes back
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
encoding = Utf16LE;
encoding = Utf8::Utf16LE;
else
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
encoding = Utf16BE;
encoding = Utf8::Utf16BE;
else
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
{
@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
throw exMalformedDslFile( fileName );
}
encoding = Utf8;
encoding = Utf8::Utf8;
}
else
{
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
encoding = Utf16LE;
encoding = Utf8::Utf16LE;
else
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
encoding = Utf16BE;
encoding = Utf8::Utf16BE;
else
{
// Ok, this doesn't look like 16-bit Unicode. We will start with a
// 8-bit encoding with an intent to find out the exact one from
// the header.
needExactEncoding = true;
encoding = Windows1251;
encoding = Utf8::Windows1251;
}
if ( gzrewind( f ) )
@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
}
else
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
encoding = Windows1252;
encoding = Utf8::Windows1252;
else
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
encoding = Windows1251;
encoding = Utf8::Windows1251;
else
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
encoding = Windows1250;
encoding = Utf8::Windows1250;
else
{
gzclose( f );
@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
// Check that we have bytes to read
if ( readBufferLeft < 5000 )
{
//readBufferPtr+=pos;
//readBufferLeft-=pos;
if ( !gzeof( f ) )
{
// To avoid having to deal with ring logic, we move the remaining bytes
@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
readBufferPtr = readBuffer;
readBufferLeft += (size_t) result;
/*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
fragStream = new QTextStream(frag) ;
fragStream->setCodec(codec);*/
}
}
//if(fragStream->atEnd())
// return false;
if(readBufferLeft<=0)
return false;
//QString line=fragStream->readLine();
int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);
@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b
/////////////// DslScanner
void DslScanner::initLineFeed(DslEncoding e)
void DslScanner::initLineFeed(Utf8::Encoding e)
{
switch (e)
{
case Utf16LE:
case Utf8::Utf16LE:
lineFeed= new char[2] {0x0A,0};
lineFeedLength = 2;
break;
case Utf16BE:
case Utf8::Utf16BE:
lineFeed = new char[2] { 0,0x0A};
lineFeedLength = 2;
break;
case Windows1252:
case Utf8::Windows1252:
case Windows1251:
case Utf8::Windows1251:
case Details::Utf8:
case Utf8::Utf8:
case Windows1250:
case Utf8::Windows1250:
default:
lineFeedLength = 1;
lineFeed = new char[1] {0x0A};

View file

@ -12,6 +12,7 @@
#include "iconv.hh"
#include <QTextCodec>
#include <QByteArray>
#include "utf8.hh"
// Implementation details for Dsl, not part of its interface
namespace Dsl {
@ -22,17 +23,9 @@ using gd::wstring;
using gd::wchar;
using std::list;
using std::vector;
using Utf8::Encoding;
// Those are possible encodings for .dsl files. The scanner first guesses from
// the file's leading bytes and, for 8-bit files, refines the guess from a
// code-page value read from the header.
enum DslEncoding
{
Utf16LE, // FF FE byte-order mark, or a non-zero byte followed by a zero byte
Utf16BE, // FE FF byte-order mark, or a zero byte followed by a non-zero byte
Windows1252, // Header value "Latin"; also the scanner's initial value
Windows1251, // Header value "Cyrillic"; provisional choice for 8-bit files
Windows1250, // Header value "EasternEuropean"
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
};
struct DSLLangCode
{
@ -44,8 +37,6 @@ string findCodeForDslId( int id );
bool isAtSignFirst( wstring const & str );
char const* getEncodingNameFor(DslEncoding e);
/// Parses the DSL language, representing it in its structural DOM form.
struct ArticleDom
{
@ -111,7 +102,7 @@ private:
class DslScanner
{
gzFile f;
DslEncoding encoding;
Encoding encoding;
QTextCodec* codec;
wstring dictionaryName;
wstring langFrom, langTo;
@ -138,9 +129,9 @@ public:
~DslScanner() throw();
/// Returns the detected encoding of this file.
DslEncoding getEncoding() const
Encoding getEncoding() const
{ return encoding; }
void initLineFeed(DslEncoding e);
void initLineFeed(Encoding e);
/// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const
@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
{
switch( encoding )
{
case Utf16LE:
case Utf16BE:
case Utf8::Utf16LE:
case Utf8::Utf16BE:
return x*2;
default:
return x;

109
gls.cc
View file

@ -58,13 +58,7 @@ using gd::wchar;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
// Text encodings the GLS scanner tells apart. Utf8 is the scanner's initial
// value and is kept when no UTF-16 byte-order mark is found.
// NOTE(review): idxHeader.glsEncoding is cast back to this enum when articles
// are loaded, so the enumerator order presumably matters for stored indexes --
// confirm before reordering.
enum Encoding
{
Utf8,
Utf16LE,
Utf16BE
};
using Utf8::Encoding;
/////////////// GlsScanner
@ -73,15 +67,14 @@ class GlsScanner
gzFile f;
Encoding encoding;
QTextCodec* codec;
Iconv iconv;
wstring dictionaryName;
wstring dictionaryDecription, dictionaryAuthor;
wstring langFrom, langTo;
char readBuffer[ 10000 ];
char * readBufferPtr;
size_t readBufferLeft;
QTextStream* fragStream;
qint64 pos;
const char* lineFeed;
int lineFeedLength;
unsigned linesRead;
public:
@ -126,30 +119,15 @@ public:
/// Reading begins from the first line after the headers (ones which end
/// by the "### Glossary section:" line).
bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
void initLineFeed(Utf8::Encoding e);
/// Returns the number of lines read so far from the file.
unsigned getLinesRead() const
{ return linesRead; }
/// Gives the encoding name to hand to iconv for the supplied encoding.
/// Every value other than the two UTF-16 variants is reported as UTF-8.
static char const * getEncodingNameFor( Encoding e )
{
  if ( e == Utf16LE )
    return Iconv::Utf16Le;

  if ( e == Utf16BE )
    return "UTF-16BE";

  return Iconv::Utf8;
}
};
GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), linesRead( 0 ), pos(0)
encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), linesRead( 0 )
{
// Since .dz is backwards-compatible with .gz, we use gz- functions to
// read it -- they are much nicer than the dict_data- ones.
@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
// If the file begins with the dedicated Unicode marker, we just consume
// it. If, on the other hand, it's not, we return the bytes back
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
encoding = Utf16LE;
encoding = Utf8::Utf16LE;
else
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
encoding = Utf16BE;
encoding = Utf8::Utf16BE;
else
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
{
@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
gzclose( f );
throw exMalformedGlsFile( fileName );
}
encoding = Utf8;
encoding = Utf8::Utf8;
}
else
{
@ -195,12 +173,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
gzclose( f );
throw exCantOpen( fileName );
}
encoding = Utf8;
encoding = Utf8::Utf8;
}
if( encoding != Utf8 )
iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
// We now can use our own readNextLine() function
wstring str;
@ -267,44 +243,73 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
}
}
}
// Sets lineFeed / lineFeedLength to the byte sequence of a line feed (0x0A)
// in the given encoding: two bytes for the UTF-16 variants, one byte for
// every other encoding.
// NOTE(review): the arrays are allocated with new[] and no matching delete[]
// is visible in this function -- confirm who releases them.
void GlsScanner::initLineFeed(Utf8::Encoding e)
{
  if ( e == Utf8::Utf16LE )
  {
    // Little-endian: low byte first.
    lineFeed= new char[2] {0x0A,0};
    lineFeedLength = 2;
  }
  else if ( e == Utf8::Utf16BE )
  {
    // Big-endian: high byte first.
    lineFeed = new char[2] { 0,0x0A};
    lineFeedLength = 2;
  }
  else
  {
    // Windows-125x, UTF-8 and anything unrecognised use a single byte.
    lineFeedLength = 1;
    lineFeed = new char[1] {0x0A};
  }
}
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
Iconv::Ex )
{
offset = (size_t)(gztell(f) - readBufferLeft + pos);
offset = (size_t)(gztell(f) - readBufferLeft);
{
// Check that we have bytes to read
if (readBufferLeft - pos < 2000)
if ( readBufferLeft < 5000 )
{
readBufferPtr += pos;
readBufferLeft -= pos;
if (!gzeof(f))
if ( !gzeof( f ) )
{
// To avoid having to deal with ring logic, we move the remaining bytes
// to the beginning
memmove(readBuffer, readBufferPtr, readBufferLeft);
memmove( readBuffer, readBufferPtr, readBufferLeft );
// Read some more bytes to readBuffer
int result = gzread(f, readBuffer + readBufferLeft,
sizeof(readBuffer) - readBufferLeft);
int result = gzread( f, readBuffer + readBufferLeft,
sizeof( readBuffer ) - readBufferLeft );
if (result == -1)
throw exCantReadGlsFile();
readBufferPtr = readBuffer;
readBufferLeft += (size_t)result;
QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
fragStream = new QTextStream(frag);
fragStream->setCodec(codec);
readBufferLeft += (size_t) result;
}
}
if (fragStream->atEnd())
if(readBufferLeft<=0)
return false;
QString line = fragStream->readLine();
pos = fragStream->pos();
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);
if(line.endsWith("\n"))
line.chop(1);
if(line.endsWith("\r"))
line.chop(1);
if(pos>readBufferLeft){
pos=readBufferLeft;
}
readBufferLeft -= pos;
readBufferPtr += pos;
linesRead++;
#ifdef __WIN32
@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address,
}
else
{
string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
string::size_type start_pos = 0, end_pos = 0;
for( ; ; )

33
utf8.cc
View file

@ -3,6 +3,7 @@
#include "utf8.hh"
#include <vector>
#include <algorithm>
namespace Utf8 {
@ -175,4 +176,36 @@ bool isspace( int c )
}
}
// Returns the size in bytes of the first line stored in s1, terminator
// included.
//   s1, s1length - buffer to scan and its size in bytes; s1 is expected to
//                  start on a character boundary.
//   s2, s2length - byte sequence of the line terminator and its size.
// If the terminator does not occur, s1length is returned so that the caller
// treats the remaining bytes as one final line; -1 is never returned.
// A non-positive s1length yields 0, and a non-positive s2length is handled
// like a missing terminator.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
{
  if ( s1length <= 0 )
    return 0;
  if ( s2length <= 0 )
    return s1length;

  char * const bufferEnd = s1 + s1length;
  char * searchFrom = s1;

  for ( ; ; )
  {
    char * const hit = std::search( searchFrom, bufferEnd, s2, s2 + s2length );
    if ( hit == bufferEnd )
      return s1length;

    // With a multi-byte terminator (UTF-16) a raw byte match can straddle two
    // code units, e.g. bytes 41 0A 00 01 contain "0A 00" at an odd offset.
    // Only a match aligned to the terminator size is a real line feed.
    if ( ( hit - s1 ) % s2length == 0 )
      return static_cast< int >( hit - s1 ) + s2length;

    searchFrom = hit + 1;
  }
}
// Looks up the character-set name string for an encoding value.
// Windows1250 and any value without its own case yield "WINDOWS-1250".
char const* getEncodingNameFor(Encoding e)
{
  char const * name = "WINDOWS-1250";

  switch (e)
  {
    case Utf16LE:
      name = "UTF-16LE";
      break;
    case Utf16BE:
      name = "UTF-16BE";
      break;
    case Windows1252:
      name = "WINDOWS-1252";
      break;
    case Windows1251:
      name = "WINDOWS-1251";
      break;
    case Utf8:
      name = "UTF-8";
      break;
    default:
      break;
  }

  return name;
}
}

18
utf8.hh
View file

@ -1,6 +1,7 @@
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __UTF8_HH_INCLUDED__
#define __UTF8_HH_INCLUDED__
#include <cstdio>
#include <string>
#include "cpp_features.hh"
@ -13,6 +14,17 @@
/// places.
namespace Utf8 {
// Text encodings used by the dictionary readers. The list originated as the
// possible encodings for .dsl files and is now also used by the GLS reader.
// NOTE(review): idxHeader.dslEncoding and idxHeader.glsEncoding are cast
// directly to this enum, so the enumerator order is presumably part of the
// stored index format -- confirm before reordering, and confirm that GLS
// indexes written with the previous GLS-local numbering are rebuilt.
enum Encoding
{
Utf16LE,
Utf16BE,
Windows1252,
Windows1251,
Windows1250,
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
};
using std::string;
using gd::wstring;
using gd::wchar;
@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode );
/// Linux but was messing up strings under Windows.
bool isspace( int c );
// Returns the size in bytes of the first line in s1 (s1length bytes long),
// counting the terminator sequence s2 (s2length bytes). If s2 does not occur
// in s1 the result is s1length, not -1.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
// Returns the character-set name for e as a C string, e.g. "UTF-16LE";
// values without a dedicated name map to "WINDOWS-1250".
char const* getEncodingNameFor(Encoding e);
}
#endif