mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 19:24:08 +00:00
refactor encoding method
This commit is contained in:
parent
02a88c98ad
commit
f0a3df3d6f
5
dsl.cc
5
dsl.cc
|
@ -75,6 +75,7 @@ using gd::wstring;
|
||||||
using gd::wchar;
|
using gd::wchar;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using std::list;
|
using std::list;
|
||||||
|
using Utf8::Encoding;
|
||||||
|
|
||||||
using BtreeIndexing::WordArticleLink;
|
using BtreeIndexing::WordArticleLink;
|
||||||
using BtreeIndexing::IndexedWords;
|
using BtreeIndexing::IndexedWords;
|
||||||
|
@ -597,7 +598,7 @@ void DslDictionary::loadArticle( uint32_t address,
|
||||||
{
|
{
|
||||||
articleData =
|
articleData =
|
||||||
Iconv::toWstring(
|
Iconv::toWstring(
|
||||||
getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
|
Utf8::getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
|
||||||
articleBody, articleSize );
|
articleBody, articleSize );
|
||||||
free( articleBody );
|
free( articleBody );
|
||||||
|
|
||||||
|
@ -1361,7 +1362,7 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
|
||||||
{
|
{
|
||||||
articleData =
|
articleData =
|
||||||
Iconv::toWstring(
|
Iconv::toWstring(
|
||||||
getEncodingNameFor( DslEncoding( idxHeader.dslEncoding ) ),
|
getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ),
|
||||||
articleBody, articleSize );
|
articleBody, articleSize );
|
||||||
free( articleBody );
|
free( articleBody );
|
||||||
|
|
||||||
|
|
116
dsl_details.cc
116
dsl_details.cc
|
@ -19,6 +19,7 @@ namespace Details {
|
||||||
|
|
||||||
using gd::wstring;
|
using gd::wstring;
|
||||||
using std::list;
|
using std::list;
|
||||||
|
using Utf8::Encoding;
|
||||||
|
|
||||||
#ifndef __linux__
|
#ifndef __linux__
|
||||||
|
|
||||||
|
@ -41,18 +42,6 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//get the first line in string s1. -1 if not found
|
|
||||||
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
|
|
||||||
{
|
|
||||||
char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
|
|
||||||
|
|
||||||
if (pos == s1 + s1length)
|
|
||||||
return pos-s1;
|
|
||||||
|
|
||||||
//the line size.
|
|
||||||
return pos- s1+ s2length;
|
|
||||||
}
|
|
||||||
|
|
||||||
static DSLLangCode LangCodes[] =
|
static DSLLangCode LangCodes[] =
|
||||||
{
|
{
|
||||||
{ 1, "en" },
|
{ 1, "en" },
|
||||||
|
@ -159,25 +148,7 @@ bool isAtSignFirst( wstring const & str )
|
||||||
return reg.indexIn( gd::toQString( str ) ) == 0;
|
return reg.indexIn( gd::toQString( str ) ) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
char const* getEncodingNameFor(DslEncoding e)
|
|
||||||
{
|
|
||||||
switch (e)
|
|
||||||
{
|
|
||||||
case Utf16LE:
|
|
||||||
return "UTF-16LE";
|
|
||||||
case Utf16BE:
|
|
||||||
return "UTF-16BE";
|
|
||||||
case Windows1252:
|
|
||||||
return "WINDOWS-1252";
|
|
||||||
case Windows1251:
|
|
||||||
return "WINDOWS-1251";
|
|
||||||
case Details::Utf8:
|
|
||||||
return "UTF-8";
|
|
||||||
case Windows1250:
|
|
||||||
default:
|
|
||||||
return "WINDOWS-1250";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////// ArticleDom
|
/////////////// ArticleDom
|
||||||
|
|
||||||
|
@ -811,38 +782,36 @@ void ArticleDom::closeTag( wstring const & name,
|
||||||
|
|
||||||
void ArticleDom::nextChar() THROW_SPEC( eot )
|
void ArticleDom::nextChar() THROW_SPEC( eot )
|
||||||
{
|
{
|
||||||
if ( !*stringPos )
|
if ( !*stringPos )
|
||||||
throw eot();
|
throw eot();
|
||||||
else{
|
|
||||||
ch = *stringPos++;
|
|
||||||
|
|
||||||
if ( ch == L'\\' )
|
ch = *stringPos++;
|
||||||
{
|
|
||||||
|
if ( ch == L'\\' )
|
||||||
|
{
|
||||||
if ( !*stringPos )
|
if ( !*stringPos )
|
||||||
throw eot();
|
throw eot();
|
||||||
|
|
||||||
ch = *stringPos++;
|
ch = *stringPos++;
|
||||||
|
|
||||||
escaped = true;
|
escaped = true;
|
||||||
}
|
}
|
||||||
else
|
else if ( ch == L'[' && *stringPos == L'[' )
|
||||||
if ( ch == L'[' && *stringPos == L'[' )
|
{
|
||||||
{
|
|
||||||
++stringPos;
|
++stringPos;
|
||||||
escaped = true;
|
escaped = true;
|
||||||
}
|
}
|
||||||
else
|
else if ( ch == L']' && *stringPos == L']' )
|
||||||
if ( ch == L']' && *stringPos == L']' )
|
{
|
||||||
{
|
|
||||||
++stringPos;
|
++stringPos;
|
||||||
escaped = true;
|
escaped = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
escaped = false;
|
escaped = false;
|
||||||
|
|
||||||
if( ch == '\n' || ch == '\r' )
|
if( ch == '\n' || ch == '\r' )
|
||||||
lineStartPos = stringPos;
|
lineStartPos = stringPos;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ArticleDom::atSignFirstInLine()
|
bool ArticleDom::atSignFirstInLine()
|
||||||
|
@ -857,7 +826,7 @@ bool ArticleDom::atSignFirstInLine()
|
||||||
/////////////// DslScanner
|
/////////////// DslScanner
|
||||||
|
|
||||||
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
encoding( Windows1252 ), readBufferPtr( readBuffer ),
|
encoding( Utf8::Windows1252 ), readBufferPtr( readBuffer ),
|
||||||
readBufferLeft( 0 ), linesRead( 0 )
|
readBufferLeft( 0 ), linesRead( 0 )
|
||||||
{
|
{
|
||||||
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
||||||
|
@ -884,10 +853,10 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
// If the file begins with the dedicated Unicode marker, we just consume
|
// If the file begins with the dedicated Unicode marker, we just consume
|
||||||
// it. If, on the other hand, it's not, we return the bytes back
|
// it. If, on the other hand, it's not, we return the bytes back
|
||||||
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
||||||
encoding = Utf16LE;
|
encoding = Utf8::Utf16LE;
|
||||||
else
|
else
|
||||||
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
||||||
encoding = Utf16BE;
|
encoding = Utf8::Utf16BE;
|
||||||
else
|
else
|
||||||
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
||||||
{
|
{
|
||||||
|
@ -899,22 +868,22 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
throw exMalformedDslFile( fileName );
|
throw exMalformedDslFile( fileName );
|
||||||
}
|
}
|
||||||
|
|
||||||
encoding = Utf8;
|
encoding = Utf8::Utf8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
|
if ( firstBytes[ 0 ] && !firstBytes[ 1 ] )
|
||||||
encoding = Utf16LE;
|
encoding = Utf8::Utf16LE;
|
||||||
else
|
else
|
||||||
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
|
if ( !firstBytes[ 0 ] && firstBytes[ 1 ] )
|
||||||
encoding = Utf16BE;
|
encoding = Utf8::Utf16BE;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Ok, this doesn't look like 16-bit Unicode. We will start with a
|
// Ok, this doesn't look like 16-bit Unicode. We will start with a
|
||||||
// 8-bit encoding with an intent to find out the exact one from
|
// 8-bit encoding with an intent to find out the exact one from
|
||||||
// the header.
|
// the header.
|
||||||
needExactEncoding = true;
|
needExactEncoding = true;
|
||||||
encoding = Windows1251;
|
encoding = Utf8::Windows1251;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( gzrewind( f ) )
|
if ( gzrewind( f ) )
|
||||||
|
@ -995,13 +964,13 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
|
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Latin" ) ) )
|
||||||
encoding = Windows1252;
|
encoding = Utf8::Windows1252;
|
||||||
else
|
else
|
||||||
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
|
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"Cyrillic" ) ) )
|
||||||
encoding = Windows1251;
|
encoding = Utf8::Windows1251;
|
||||||
else
|
else
|
||||||
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
|
if ( !wcscasecmp( arg.c_str(), GD_NATIVE_TO_WS( L"EasternEuropean" ) ) )
|
||||||
encoding = Windows1250;
|
encoding = Utf8::Windows1250;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
gzclose( f );
|
gzclose( f );
|
||||||
|
@ -1036,8 +1005,6 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
|
||||||
// Check that we have bytes to read
|
// Check that we have bytes to read
|
||||||
if ( readBufferLeft < 5000 )
|
if ( readBufferLeft < 5000 )
|
||||||
{
|
{
|
||||||
//readBufferPtr+=pos;
|
|
||||||
//readBufferLeft-=pos;
|
|
||||||
if ( !gzeof( f ) )
|
if ( !gzeof( f ) )
|
||||||
{
|
{
|
||||||
// To avoid having to deal with ring logic, we move the remaining bytes
|
// To avoid having to deal with ring logic, we move the remaining bytes
|
||||||
|
@ -1053,19 +1020,12 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
|
||||||
|
|
||||||
readBufferPtr = readBuffer;
|
readBufferPtr = readBuffer;
|
||||||
readBufferLeft += (size_t) result;
|
readBufferLeft += (size_t) result;
|
||||||
/*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
|
|
||||||
fragStream = new QTextStream(frag) ;
|
|
||||||
fragStream->setCodec(codec);*/
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//if(fragStream->atEnd())
|
|
||||||
// return false;
|
|
||||||
|
|
||||||
if(readBufferLeft<=0)
|
if(readBufferLeft<=0)
|
||||||
return false;
|
return false;
|
||||||
//QString line=fragStream->readLine();
|
|
||||||
int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
|
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
|
||||||
if(pos==-1)
|
if(pos==-1)
|
||||||
return false;
|
return false;
|
||||||
QString line = codec->toUnicode(readBufferPtr, pos);
|
QString line = codec->toUnicode(readBufferPtr, pos);
|
||||||
|
@ -1123,25 +1083,25 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b
|
||||||
|
|
||||||
/////////////// DslScanner
|
/////////////// DslScanner
|
||||||
|
|
||||||
void DslScanner::initLineFeed(DslEncoding e)
|
void DslScanner::initLineFeed(Utf8::Encoding e)
|
||||||
{
|
{
|
||||||
switch (e)
|
switch (e)
|
||||||
{
|
{
|
||||||
case Utf16LE:
|
case Utf8::Utf16LE:
|
||||||
lineFeed= new char[2] {0x0A,0};
|
lineFeed= new char[2] {0x0A,0};
|
||||||
lineFeedLength = 2;
|
lineFeedLength = 2;
|
||||||
break;
|
break;
|
||||||
case Utf16BE:
|
case Utf8::Utf16BE:
|
||||||
lineFeed = new char[2] { 0,0x0A};
|
lineFeed = new char[2] { 0,0x0A};
|
||||||
lineFeedLength = 2;
|
lineFeedLength = 2;
|
||||||
break;
|
break;
|
||||||
case Windows1252:
|
case Utf8::Windows1252:
|
||||||
|
|
||||||
case Windows1251:
|
case Utf8::Windows1251:
|
||||||
|
|
||||||
case Details::Utf8:
|
case Utf8::Utf8:
|
||||||
|
|
||||||
case Windows1250:
|
case Utf8::Windows1250:
|
||||||
default:
|
default:
|
||||||
lineFeedLength = 1;
|
lineFeedLength = 1;
|
||||||
lineFeed = new char[1] {0x0A};
|
lineFeed = new char[1] {0x0A};
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include "iconv.hh"
|
#include "iconv.hh"
|
||||||
#include <QTextCodec>
|
#include <QTextCodec>
|
||||||
#include <QByteArray>
|
#include <QByteArray>
|
||||||
|
#include "utf8.hh"
|
||||||
|
|
||||||
// Implementation details for Dsl, not part of its interface
|
// Implementation details for Dsl, not part of its interface
|
||||||
namespace Dsl {
|
namespace Dsl {
|
||||||
|
@ -22,17 +23,9 @@ using gd::wstring;
|
||||||
using gd::wchar;
|
using gd::wchar;
|
||||||
using std::list;
|
using std::list;
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
using Utf8::Encoding;
|
||||||
|
|
||||||
|
|
||||||
// Those are possible encodings for .dsl files
|
|
||||||
enum DslEncoding
|
|
||||||
{
|
|
||||||
Utf16LE,
|
|
||||||
Utf16BE,
|
|
||||||
Windows1252,
|
|
||||||
Windows1251,
|
|
||||||
Windows1250,
|
|
||||||
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
|
|
||||||
};
|
|
||||||
|
|
||||||
struct DSLLangCode
|
struct DSLLangCode
|
||||||
{
|
{
|
||||||
|
@ -44,8 +37,6 @@ string findCodeForDslId( int id );
|
||||||
|
|
||||||
bool isAtSignFirst( wstring const & str );
|
bool isAtSignFirst( wstring const & str );
|
||||||
|
|
||||||
char const* getEncodingNameFor(DslEncoding e);
|
|
||||||
|
|
||||||
/// Parses the DSL language, representing it in its structural DOM form.
|
/// Parses the DSL language, representing it in its structural DOM form.
|
||||||
struct ArticleDom
|
struct ArticleDom
|
||||||
{
|
{
|
||||||
|
@ -111,7 +102,7 @@ private:
|
||||||
class DslScanner
|
class DslScanner
|
||||||
{
|
{
|
||||||
gzFile f;
|
gzFile f;
|
||||||
DslEncoding encoding;
|
Encoding encoding;
|
||||||
QTextCodec* codec;
|
QTextCodec* codec;
|
||||||
wstring dictionaryName;
|
wstring dictionaryName;
|
||||||
wstring langFrom, langTo;
|
wstring langFrom, langTo;
|
||||||
|
@ -138,9 +129,9 @@ public:
|
||||||
~DslScanner() throw();
|
~DslScanner() throw();
|
||||||
|
|
||||||
/// Returns the detected encoding of this file.
|
/// Returns the detected encoding of this file.
|
||||||
DslEncoding getEncoding() const
|
Encoding getEncoding() const
|
||||||
{ return encoding; }
|
{ return encoding; }
|
||||||
void initLineFeed(DslEncoding e);
|
void initLineFeed(Encoding e);
|
||||||
|
|
||||||
/// Returns the dictionary's name, as was read from file's headers.
|
/// Returns the dictionary's name, as was read from file's headers.
|
||||||
wstring const & getDictionaryName() const
|
wstring const & getDictionaryName() const
|
||||||
|
@ -207,8 +198,8 @@ inline size_t DslScanner::distanceToBytes( size_t x ) const
|
||||||
{
|
{
|
||||||
switch( encoding )
|
switch( encoding )
|
||||||
{
|
{
|
||||||
case Utf16LE:
|
case Utf8::Utf16LE:
|
||||||
case Utf16BE:
|
case Utf8::Utf16BE:
|
||||||
return x*2;
|
return x*2;
|
||||||
default:
|
default:
|
||||||
return x;
|
return x;
|
||||||
|
|
137
gls.cc
137
gls.cc
|
@ -58,13 +58,7 @@ using gd::wchar;
|
||||||
using BtreeIndexing::WordArticleLink;
|
using BtreeIndexing::WordArticleLink;
|
||||||
using BtreeIndexing::IndexedWords;
|
using BtreeIndexing::IndexedWords;
|
||||||
using BtreeIndexing::IndexInfo;
|
using BtreeIndexing::IndexInfo;
|
||||||
|
using Utf8::Encoding;
|
||||||
enum Encoding
|
|
||||||
{
|
|
||||||
Utf8,
|
|
||||||
Utf16LE,
|
|
||||||
Utf16BE
|
|
||||||
};
|
|
||||||
|
|
||||||
/////////////// GlsScanner
|
/////////////// GlsScanner
|
||||||
|
|
||||||
|
@ -73,15 +67,14 @@ class GlsScanner
|
||||||
gzFile f;
|
gzFile f;
|
||||||
Encoding encoding;
|
Encoding encoding;
|
||||||
QTextCodec* codec;
|
QTextCodec* codec;
|
||||||
Iconv iconv;
|
|
||||||
wstring dictionaryName;
|
wstring dictionaryName;
|
||||||
wstring dictionaryDecription, dictionaryAuthor;
|
wstring dictionaryDecription, dictionaryAuthor;
|
||||||
wstring langFrom, langTo;
|
wstring langFrom, langTo;
|
||||||
char readBuffer[ 10000 ];
|
char readBuffer[ 10000 ];
|
||||||
char * readBufferPtr;
|
char * readBufferPtr;
|
||||||
size_t readBufferLeft;
|
size_t readBufferLeft;
|
||||||
QTextStream* fragStream;
|
const char* lineFeed;
|
||||||
qint64 pos;
|
int lineFeedLength;
|
||||||
unsigned linesRead;
|
unsigned linesRead;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -126,30 +119,15 @@ public:
|
||||||
/// Reading begins from the first line after the headers (ones which end
|
/// Reading begins from the first line after the headers (ones which end
|
||||||
/// by the "### Glossary section:" line).
|
/// by the "### Glossary section:" line).
|
||||||
bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
|
bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
|
||||||
|
void initLineFeed(Utf8::Encoding e);
|
||||||
/// Returns the number of lines read so far from the file.
|
/// Returns the number of lines read so far from the file.
|
||||||
unsigned getLinesRead() const
|
unsigned getLinesRead() const
|
||||||
{ return linesRead; }
|
{ return linesRead; }
|
||||||
|
|
||||||
/// Returns a name to be passed to iconv for the given encoding.
|
|
||||||
static char const * getEncodingNameFor( Encoding e )
|
|
||||||
{
|
|
||||||
switch( e )
|
|
||||||
{
|
|
||||||
case Utf16LE:
|
|
||||||
return Iconv::Utf16Le;
|
|
||||||
case Utf16BE:
|
|
||||||
return "UTF-16BE";
|
|
||||||
case Utf8:
|
|
||||||
default:
|
|
||||||
return Iconv::Utf8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
encoding( Utf8 ), iconv( Iconv::GdWchar, Iconv::Utf8 ), readBufferPtr( readBuffer ),
|
encoding( Utf8::Utf8 ), readBufferPtr( readBuffer ),
|
||||||
readBufferLeft( 0 ), linesRead( 0 ), pos(0)
|
readBufferLeft( 0 ), linesRead( 0 )
|
||||||
{
|
{
|
||||||
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
||||||
// read it -- they are much nicer than the dict_data- ones.
|
// read it -- they are much nicer than the dict_data- ones.
|
||||||
|
@ -172,10 +150,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
// If the file begins with the dedicated Unicode marker, we just consume
|
// If the file begins with the dedicated Unicode marker, we just consume
|
||||||
// it. If, on the other hand, it's not, we return the bytes back
|
// it. If, on the other hand, it's not, we return the bytes back
|
||||||
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE )
|
||||||
encoding = Utf16LE;
|
encoding = Utf8::Utf16LE;
|
||||||
else
|
else
|
||||||
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF )
|
||||||
encoding = Utf16BE;
|
encoding = Utf8::Utf16BE;
|
||||||
else
|
else
|
||||||
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB )
|
||||||
{
|
{
|
||||||
|
@ -186,7 +164,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
gzclose( f );
|
gzclose( f );
|
||||||
throw exMalformedGlsFile( fileName );
|
throw exMalformedGlsFile( fileName );
|
||||||
}
|
}
|
||||||
encoding = Utf8;
|
encoding = Utf8::Utf8;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -195,12 +173,10 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
gzclose( f );
|
gzclose( f );
|
||||||
throw exCantOpen( fileName );
|
throw exCantOpen( fileName );
|
||||||
}
|
}
|
||||||
encoding = Utf8;
|
encoding = Utf8::Utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if( encoding != Utf8 )
|
codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
|
||||||
iconv.reinit( Iconv::GdWchar, getEncodingNameFor( encoding ) );
|
|
||||||
codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
|
|
||||||
// We now can use our own readNextLine() function
|
// We now can use our own readNextLine() function
|
||||||
|
|
||||||
wstring str;
|
wstring str;
|
||||||
|
@ -267,45 +243,74 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Sets lineFeed/lineFeedLength to the byte sequence of a line feed (0x0A) in
// encoding e: two bytes for the UTF-16 variants (byte order matching the
// encoding), a single byte for every other encoding.
// NOTE(review): the array is allocated with new[] on every call; no matching
// delete[] is visible here -- confirm where it is released.
void GlsScanner::initLineFeed(Utf8::Encoding e)
|
||||||
|
{
|
||||||
|
switch (e)
|
||||||
|
{
|
||||||
|
case Utf8::Utf16LE:
|
||||||
|
lineFeed= new char[2] {0x0A,0};
|
||||||
|
lineFeedLength = 2;
|
||||||
|
break;
|
||||||
|
case Utf8::Utf16BE:
|
||||||
|
lineFeed = new char[2] { 0,0x0A};
|
||||||
|
lineFeedLength = 2;
|
||||||
|
break;
|
||||||
|
case Utf8::Windows1252:
|
||||||
|

||||||
|
case Utf8::Windows1251:
|
||||||
|

||||||
|
case Utf8::Utf8:
|
||||||
|

||||||
|
case Utf8::Windows1250:
|
||||||
|
default:
|
||||||
|
lineFeedLength = 1;
|
||||||
|
lineFeed = new char[1] {0x0A};
|
||||||
|
}
|
||||||
|
}
|
||||||
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
|
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
|
||||||
Iconv::Ex )
|
Iconv::Ex )
|
||||||
{
|
{
|
||||||
offset = (size_t)(gztell(f) - readBufferLeft + pos);
|
offset = (size_t)(gztell(f) - readBufferLeft);
|
||||||
|
|
||||||
{
|
{
|
||||||
// Check that we have bytes to read
|
// Check that we have bytes to read
|
||||||
if (readBufferLeft - pos < 2000)
|
if ( readBufferLeft < 5000 )
|
||||||
{
|
{
|
||||||
readBufferPtr += pos;
|
if ( !gzeof( f ) )
|
||||||
readBufferLeft -= pos;
|
{
|
||||||
if (!gzeof(f))
|
// To avoid having to deal with ring logic, we move the remaining bytes
|
||||||
{
|
// to the beginning
|
||||||
// To avoid having to deal with ring logic, we move the remaining bytes
|
memmove( readBuffer, readBufferPtr, readBufferLeft );
|
||||||
// to the beginning
|
|
||||||
memmove(readBuffer, readBufferPtr, readBufferLeft);
|
|
||||||
|
|
||||||
// Read some more bytes to readBuffer
|
// Read some more bytes to readBuffer
|
||||||
int result = gzread(f, readBuffer + readBufferLeft,
|
int result = gzread( f, readBuffer + readBufferLeft,
|
||||||
sizeof(readBuffer) - readBufferLeft);
|
sizeof( readBuffer ) - readBufferLeft );
|
||||||
|
|
||||||
if (result == -1)
|
if (result == -1)
|
||||||
throw exCantReadGlsFile();
|
throw exCantReadGlsFile();
|
||||||
|
|
||||||
readBufferPtr = readBuffer;
|
readBufferPtr = readBuffer;
|
||||||
readBufferLeft += (size_t)result;
|
readBufferLeft += (size_t) result;
|
||||||
QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
|
}
|
||||||
fragStream = new QTextStream(frag);
|
}
|
||||||
fragStream->setCodec(codec);
|
if(readBufferLeft<=0)
|
||||||
}
|
return false;
|
||||||
}
|
|
||||||
|
|
||||||
if (fragStream->atEnd())
|
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
|
||||||
return false;
|
if(pos==-1)
|
||||||
|
return false;
|
||||||
|
QString line = codec->toUnicode(readBufferPtr, pos);
|
||||||
|
if(line.endsWith("\n"))
|
||||||
|
line.chop(1);
|
||||||
|
if(line.endsWith("\r"))
|
||||||
|
line.chop(1);
|
||||||
|
|
||||||
QString line = fragStream->readLine();
|
if(pos>readBufferLeft){
|
||||||
pos = fragStream->pos();
|
pos=readBufferLeft;
|
||||||
linesRead++;
|
}
|
||||||
|
readBufferLeft -= pos;
|
||||||
|
readBufferPtr += pos;
|
||||||
|
linesRead++;
|
||||||
|
|
||||||
#ifdef __WIN32
|
#ifdef __WIN32
|
||||||
out = line.toStdU32String();
|
out = line.toStdU32String();
|
||||||
|
@ -314,7 +319,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GlsScanner::~GlsScanner() throw()
|
GlsScanner::~GlsScanner() throw()
|
||||||
|
@ -669,7 +674,7 @@ void GlsDictionary::loadArticleText( uint32_t address,
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
string articleData = Iconv::toUtf8( GlsScanner::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
|
string articleData = Iconv::toUtf8( Utf8::getEncodingNameFor( Encoding( idxHeader.glsEncoding ) ), articleBody, articleSize );
|
||||||
string::size_type start_pos = 0, end_pos = 0;
|
string::size_type start_pos = 0, end_pos = 0;
|
||||||
|
|
||||||
for( ; ; )
|
for( ; ; )
|
||||||
|
|
33
utf8.cc
33
utf8.cc
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#include "utf8.hh"
|
#include "utf8.hh"
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
namespace Utf8 {
|
namespace Utf8 {
|
||||||
|
|
||||||
|
@ -175,4 +176,36 @@ bool isspace( int c )
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//Returns the size in bytes of the first line in s1, line feed included.
//  s1, s1length - buffer to scan; it must start on a code unit boundary.
//  s2, s2length - the encoded line feed sequence to look for.
//When no line feed is present the whole buffer counts as one line and
//s1length is returned; -1 is never returned.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
{
  // Without a usable line feed sequence nothing can match.
  if ( s2length <= 0 )
    return s1length;

  char* const end = s1 + s1length;
  char* from = s1;

  for( ; ; )
  {
    char* pos = std::search( from, end, s2, s2 + s2length );

    if ( pos == end )
      return s1length;

    // With multi-byte code units (UTF-16) the pattern can also show up
    // straddling two neighbouring characters, e.g. the bytes 05 0A 00 01 in
    // UTF-16LE. Only matches aligned to the code unit size are line feeds.
    if ( ( pos - s1 ) % s2length == 0 )
      return (int)( pos - s1 ) + s2length;

    from = pos + 1;
  }
}
|
||||||
|
|
||||||
|
// Maps an Encoding value to its encoding name string (e.g. "UTF-16LE").
// Windows1250 and any value without an explicit case yield "WINDOWS-1250".
char const* getEncodingNameFor(Encoding e)
|
||||||
|
{
|
||||||
|
switch (e)
|
||||||
|
{
|
||||||
|
case Utf16LE:
|
||||||
|
return "UTF-16LE";
|
||||||
|
case Utf16BE:
|
||||||
|
return "UTF-16BE";
|
||||||
|
case Windows1252:
|
||||||
|
return "WINDOWS-1252";
|
||||||
|
case Windows1251:
|
||||||
|
return "WINDOWS-1251";
|
||||||
|
case Utf8:
|
||||||
|
return "UTF-8";
|
||||||
|
case Windows1250:
|
||||||
|
default:
|
||||||
|
return "WINDOWS-1250";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
18
utf8.hh
18
utf8.hh
|
@ -1,6 +1,7 @@
|
||||||
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
||||||
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
||||||
|
#ifndef __UTF8_HH_INCLUDED__
|
||||||
|
#define __UTF8_HH_INCLUDED__
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "cpp_features.hh"
|
#include "cpp_features.hh"
|
||||||
|
@ -13,6 +14,17 @@
|
||||||
/// places.
|
/// places.
|
||||||
namespace Utf8 {
|
namespace Utf8 {
|
||||||
|
|
||||||
|
// Text encodings understood by the dictionary scanners that share this
// header (both the DSL and the GLS readers use Utf8::Encoding).
// NOTE(review): values of this enum are cast from integers stored in index
// headers (e.g. Encoding( idxHeader.dslEncoding )), so the enumerator order
// presumably has to stay stable for existing indexes -- confirm before
// reordering.
|
||||||
|
enum Encoding
|
||||||
|
{
|
||||||
|
Utf16LE,
|
||||||
|
Utf16BE,
|
||||||
|
Windows1252,
|
||||||
|
Windows1251,
|
||||||
|
Windows1250,
|
||||||
|
Utf8 // This is an extension. Detected solely by the UTF8 BOM.
|
||||||
|
};
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using gd::wstring;
|
using gd::wstring;
|
||||||
using gd::wchar;
|
using gd::wchar;
|
||||||
|
@ -40,4 +52,8 @@ wstring decode( string const & ) THROW_SPEC( exCantDecode );
|
||||||
/// Linux but was messing up strings under Windows.
|
/// Linux but was messing up strings under Windows.
|
||||||
bool isspace( int c );
|
bool isspace( int c );
|
||||||
|
|
||||||
|
// Returns the size in bytes of the first line in s1, including the line feed sequence s2; returns s1length when s1 contains no line feed.
|
||||||
|
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
|
||||||
|
char const* getEncodingNameFor(Encoding e);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
Loading…
Reference in a new issue