refactor encoding method

This commit is contained in:
xiaoyifang 2021-11-06 16:55:51 +08:00
parent f0a3df3d6f
commit 8405035d2f
5 changed files with 47 additions and 58 deletions

View file

@ -895,7 +895,7 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
//iconv.reinit( encoding );
codec = QTextCodec::codecForName(getEncodingNameFor(encoding));
initLineFeed(encoding);
lineFeed=Utf8::initLineFeed(encoding);
// We now can use our own readNextLine() function
wstring str;
@ -1025,7 +1025,7 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
if(readBufferLeft<=0)
return false;
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed.lineFeed,lineFeed.length);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);
@ -1083,31 +1083,6 @@ bool DslScanner::readNextLineWithoutComments( wstring & out, size_t & offset , b
/////////////// DslScanner
/// Selects the byte sequence that represents a line feed (0x0A) in the given
/// encoding and stores it, together with its length in bytes, in the
/// lineFeed / lineFeedLength members.
///
/// UTF-16LE and UTF-16BE use a two-byte sequence; every other encoding uses
/// the single byte 0x0A.
///
/// The sequences live in static storage instead of being allocated with
/// new[] on every call, so no heap memory is handed out here. The pointer
/// stored in lineFeed is therefore non-owning and must never be passed to
/// delete[].
void DslScanner::initLineFeed(Utf8::Encoding e)
{
  static const char utf16LeFeed[ 2 ] = { 0x0A, 0 };
  static const char utf16BeFeed[ 2 ] = { 0, 0x0A };
  static const char singleByteFeed[ 1 ] = { 0x0A };

  switch (e)
  {
    case Utf8::Utf16LE:
      lineFeed = utf16LeFeed;
      lineFeedLength = 2;
      break;
    case Utf8::Utf16BE:
      lineFeed = utf16BeFeed;
      lineFeedLength = 2;
      break;
    case Utf8::Windows1252:
    case Utf8::Windows1251:
    case Utf8::Utf8:
    case Utf8::Windows1250:
    default:
      // Single-byte encodings and UTF-8 all encode a line feed as one byte.
      lineFeed = singleByteFeed;
      lineFeedLength = 1;
      break;
  }
}
void processUnsortedParts( wstring & str, bool strip )
{
int refCount = 0;

View file

@ -24,6 +24,7 @@ using gd::wchar;
using std::list;
using std::vector;
using Utf8::Encoding;
using Utf8::LineFeed;
@ -110,8 +111,7 @@ class DslScanner
char readBuffer[ 65536 ];
QTextStream* fragStream;
char * readBufferPtr;
const char* lineFeed;
int lineFeedLength;
LineFeed lineFeed;
size_t readBufferLeft;
//qint64 pos;
unsigned linesRead;
@ -131,7 +131,6 @@ public:
/// Accessor for the encoding that was detected for this file.
Encoding getEncoding() const
{
  return encoding;
}
void initLineFeed(Encoding e);
/// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const

33
gls.cc
View file

@ -59,6 +59,7 @@ using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
using Utf8::Encoding;
using Utf8::LineFeed;
/////////////// GlsScanner
@ -73,8 +74,7 @@ class GlsScanner
char readBuffer[ 10000 ];
char * readBufferPtr;
size_t readBufferLeft;
const char* lineFeed;
int lineFeedLength;
LineFeed lineFeed;
unsigned linesRead;
public:
@ -119,7 +119,6 @@ public:
/// Reading begins from the first line after the headers (ones which end
/// by the "### Glossary section:" line).
bool readNextLine( wstring &, size_t & offset ) THROW_SPEC( Ex, Iconv::Ex );
void initLineFeed(Utf8::Encoding e);
/// Reports how many lines have been read from the file so far.
unsigned getLinesRead() const
{
  return linesRead;
}
@ -178,6 +177,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
codec = QTextCodec::codecForName(Utf8::getEncodingNameFor(encoding));
// We now can use our own readNextLine() function
lineFeed = Utf8::initLineFeed(encoding);
wstring str;
wstring *currentField = 0;
@ -243,30 +243,7 @@ GlsScanner::GlsScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
}
}
}
/// Selects the byte sequence that represents a line feed (0x0A) in the given
/// encoding and stores it, together with its length in bytes, in the
/// lineFeed / lineFeedLength members.
///
/// UTF-16LE and UTF-16BE use a two-byte sequence; every other encoding uses
/// the single byte 0x0A.
///
/// The sequences live in static storage instead of being allocated with
/// new[] on every call, so no heap memory is handed out here. The pointer
/// stored in lineFeed is therefore non-owning and must never be passed to
/// delete[].
void GlsScanner::initLineFeed(Utf8::Encoding e)
{
  static const char utf16LeFeed[ 2 ] = { 0x0A, 0 };
  static const char utf16BeFeed[ 2 ] = { 0, 0x0A };
  static const char singleByteFeed[ 1 ] = { 0x0A };

  switch (e)
  {
    case Utf8::Utf16LE:
      lineFeed = utf16LeFeed;
      lineFeedLength = 2;
      break;
    case Utf8::Utf16BE:
      lineFeed = utf16BeFeed;
      lineFeedLength = 2;
      break;
    case Utf8::Windows1252:
    case Utf8::Windows1251:
    case Utf8::Utf8:
    case Utf8::Windows1250:
    default:
      // Single-byte encodings and UTF-8 all encode a line feed as one byte.
      lineFeed = singleByteFeed;
      lineFeedLength = 1;
      break;
  }
}
bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
Iconv::Ex )
{
@ -286,7 +263,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
int result = gzread( f, readBuffer + readBufferLeft,
sizeof( readBuffer ) - readBufferLeft );
if (result == -1)
if (result == -1)
throw exCantReadGlsFile();
readBufferPtr = readBuffer;
@ -296,7 +273,7 @@ bool GlsScanner::readNextLine( wstring & out, size_t & offset ) THROW_SPEC( Ex,
if(readBufferLeft<=0)
return false;
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
int pos = Utf8::findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed.lineFeed,lineFeed.length);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);

27
utf8.cc
View file

@ -208,4 +208,31 @@ char const* getEncodingNameFor(Encoding e)
}
}
/// Builds the LineFeed descriptor for the given encoding: the byte sequence
/// that represents a line feed (0x0A) and its length in bytes.
///
/// UTF-16LE and UTF-16BE yield a two-byte sequence; every other encoding
/// yields the single byte 0x0A.
///
/// The returned pointer refers to static storage shared by all callers
/// rather than to a fresh new[] allocation, so repeated calls do not
/// allocate. Callers must treat the bytes as read-only and must not
/// delete[] the pointer.
LineFeed initLineFeed(Encoding e)
{
  // Non-const only because LineFeed::lineFeed is declared as char*.
  static char utf16LeFeed[ 2 ] = { 0x0A, 0 };
  static char utf16BeFeed[ 2 ] = { 0, 0x0A };
  static char singleByteFeed[ 1 ] = { 0x0A };

  LineFeed lf;
  switch (e)
  {
    case Utf8::Utf16LE:
      lf.lineFeed = utf16LeFeed;
      lf.length = 2;
      break;
    case Utf8::Utf16BE:
      lf.lineFeed = utf16BeFeed;
      lf.length = 2;
      break;
    case Utf8::Windows1252:
    case Utf8::Windows1251:
    case Utf8::Utf8:
    case Utf8::Windows1250:
    default:
      // Single-byte encodings and UTF-8 all encode a line feed as one byte.
      lf.lineFeed = singleByteFeed;
      lf.length = 1;
      break;
  }
  return lf;
}
}

11
utf8.hh
View file

@ -2,6 +2,7 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __UTF8_HH_INCLUDED__
#define __UTF8_HH_INCLUDED__
#include <cstdio>
#include <string>
#include "cpp_features.hh"
@ -55,5 +56,15 @@ bool isspace( int c );
//get the first line in string s1. -1 if not found
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
char const* getEncodingNameFor(Encoding e);
/// Describes the byte sequence that represents a line feed in some text
/// encoding. Instances are produced by initLineFeed().
///
/// Both members have default initializers so that a default-constructed
/// LineFeed is in a well-defined "empty" state instead of holding
/// indeterminate values.
struct LineFeed
{
  /// Number of bytes in the sequence pointed to by lineFeed.
  int length = 0;
  /// First byte of the line-feed sequence; null until assigned. The sequence
  /// is not necessarily NUL-terminated, so always use it together with length.
  char* lineFeed = nullptr;
};
LineFeed initLineFeed(Encoding e);
}
#endif