mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 19:24:08 +00:00
load dsl dictionary performance improved
This commit is contained in:
parent
fb2b667cc5
commit
2633b32458
128
dsl_details.cc
128
dsl_details.cc
|
@ -12,6 +12,7 @@
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
namespace Dsl {
|
namespace Dsl {
|
||||||
namespace Details {
|
namespace Details {
|
||||||
|
@ -40,6 +41,18 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Returns the length, in bytes, of the first line stored in the buffer
// [s1, s1 + s1length), INCLUDING its terminating line-feed sequence.
//
//   s1, s1length - the raw (still encoded) bytes to scan; s1 is expected to
//                  start on a code-unit boundary (i.e. at the start of a line).
//   s2, s2length - the encoded line-feed sequence to look for (one byte for
//                  single-byte encodings and UTF-8, two bytes for UTF-16).
//
// If no line feed is present, the whole buffer is treated as one (last,
// unterminated) line and s1length is returned. The function never returns -1.
// An empty buffer yields 0.
int findFirstLinePosition( char* s1, int s1length, const char* s2, int s2length )
{
  if ( s1length <= 0 )
    return 0;

  // An empty delimiter can never terminate a line. Returning 0 here would make
  // callers consume nothing and spin forever, so hand back the whole buffer.
  if ( s2length <= 0 )
    return s1length;

  char * const end = s1 + s1length;
  char * from = s1;

  for ( ;; )
  {
    char * hit = std::search( from, end, s2, s2 + s2length );

    if ( hit == end )
      return s1length;

    // For multi-byte line feeds (UTF-16) a byte-wise match may straddle two
    // adjacent code units, e.g. U+0A05 U+0100 in UTF-16LE is 05 0A 00 01.
    // Only matches aligned to the code-unit size are real line feeds.
    if ( ( hit - s1 ) % s2length == 0 )
      return static_cast< int >( hit - s1 ) + s2length;

    from = hit + 1;
  }
}
|
||||||
|
|
||||||
static DSLLangCode LangCodes[] =
|
static DSLLangCode LangCodes[] =
|
||||||
{
|
{
|
||||||
{ 1, "en" },
|
{ 1, "en" },
|
||||||
|
@ -780,35 +793,36 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
|
||||||
{
|
{
|
||||||
if ( !*stringPos )
|
if ( !*stringPos )
|
||||||
throw eot();
|
throw eot();
|
||||||
|
else{
|
||||||
|
ch = *stringPos++;
|
||||||
|
|
||||||
ch = *stringPos++;
|
if ( ch == L'\\' )
|
||||||
|
{
|
||||||
|
if ( !*stringPos )
|
||||||
|
throw eot();
|
||||||
|
|
||||||
if ( ch == L'\\' )
|
ch = *stringPos++;
|
||||||
{
|
|
||||||
if ( !*stringPos )
|
|
||||||
throw eot();
|
|
||||||
|
|
||||||
ch = *stringPos++;
|
escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if ( ch == L'[' && *stringPos == L'[' )
|
||||||
|
{
|
||||||
|
++stringPos;
|
||||||
|
escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if ( ch == L']' && *stringPos == L']' )
|
||||||
|
{
|
||||||
|
++stringPos;
|
||||||
|
escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
escaped = false;
|
||||||
|
|
||||||
escaped = true;
|
if( ch == '\n' || ch == '\r' )
|
||||||
|
lineStartPos = stringPos;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
if ( ch == L'[' && *stringPos == L'[' )
|
|
||||||
{
|
|
||||||
++stringPos;
|
|
||||||
escaped = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
if ( ch == L']' && *stringPos == L']' )
|
|
||||||
{
|
|
||||||
++stringPos;
|
|
||||||
escaped = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
escaped = false;
|
|
||||||
|
|
||||||
if( ch == '\n' || ch == '\r' )
|
|
||||||
lineStartPos = stringPos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ArticleDom::atSignFirstInLine()
|
bool ArticleDom::atSignFirstInLine()
|
||||||
|
@ -824,7 +838,7 @@ bool ArticleDom::atSignFirstInLine()
|
||||||
|
|
||||||
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ),
|
encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ),
|
||||||
readBufferLeft( 0 ), linesRead( 0 ), pos(0)
|
readBufferLeft( 0 ), linesRead( 0 )
|
||||||
{
|
{
|
||||||
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
// Since .dz is backwards-compatible with .gz, we use gz- functions to
|
||||||
// read it -- they are much nicer than the dict_data- ones.
|
// read it -- they are much nicer than the dict_data- ones.
|
||||||
|
@ -890,8 +904,9 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
iconv.reinit( encoding );
|
//iconv.reinit( encoding );
|
||||||
codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding));
|
codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding));
|
||||||
|
initLineFeed(encoding);
|
||||||
// We now can use our own readNextLine() function
|
// We now can use our own readNextLine() function
|
||||||
|
|
||||||
wstring str;
|
wstring str;
|
||||||
|
@ -984,7 +999,6 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
|
||||||
gzseek( f, offset, SEEK_SET );
|
gzseek( f, offset, SEEK_SET );
|
||||||
readBufferPtr = readBuffer;
|
readBufferPtr = readBuffer;
|
||||||
readBufferLeft = 0;
|
readBufferLeft = 0;
|
||||||
pos = 0;
|
|
||||||
|
|
||||||
if ( needExactEncoding )
|
if ( needExactEncoding )
|
||||||
iconv.reinit( encoding );
|
iconv.reinit( encoding );
|
||||||
|
@ -998,15 +1012,15 @@ DslScanner::~DslScanner() throw()
|
||||||
bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex,
|
bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex,
|
||||||
Iconv::Ex )
|
Iconv::Ex )
|
||||||
{
|
{
|
||||||
offset = (size_t)( gztell( f ) - readBufferLeft+pos );
|
offset = (size_t)( gztell( f ) - readBufferLeft/*+pos*/ );
|
||||||
|
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
// Check that we have bytes to read
|
// Check that we have bytes to read
|
||||||
if ( readBufferLeft-pos < 2000 )
|
if ( readBufferLeft < 5000 )
|
||||||
{
|
{
|
||||||
readBufferPtr+=pos;
|
//readBufferPtr+=pos;
|
||||||
readBufferLeft-=pos;
|
//readBufferLeft-=pos;
|
||||||
if ( !gzeof( f ) )
|
if ( !gzeof( f ) )
|
||||||
{
|
{
|
||||||
// To avoid having to deal with ring logic, we move the remaining bytes
|
// To avoid having to deal with ring logic, we move the remaining bytes
|
||||||
|
@ -1022,17 +1036,32 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
|
||||||
|
|
||||||
readBufferPtr = readBuffer;
|
readBufferPtr = readBuffer;
|
||||||
readBufferLeft += (size_t) result;
|
readBufferLeft += (size_t) result;
|
||||||
QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
|
/*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
|
||||||
fragStream = new QTextStream(frag) ;
|
fragStream = new QTextStream(frag) ;
|
||||||
fragStream->setCodec(codec);
|
fragStream->setCodec(codec);*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(fragStream->atEnd())
|
//if(fragStream->atEnd())
|
||||||
return false;
|
// return false;
|
||||||
|
|
||||||
QString line=fragStream->readLine();
|
if(readBufferLeft<=0)
|
||||||
pos = fragStream->pos();
|
return false;
|
||||||
|
//QString line=fragStream->readLine();
|
||||||
|
int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
|
||||||
|
if(pos==-1)
|
||||||
|
return false;
|
||||||
|
QString line = codec->toUnicode(readBufferPtr, pos);
|
||||||
|
if(line.endsWith("\n"))
|
||||||
|
line.chop(1);
|
||||||
|
if(line.endsWith("\r"))
|
||||||
|
line.chop(1);
|
||||||
|
|
||||||
|
if(pos>readBufferLeft){
|
||||||
|
pos=readBufferLeft;
|
||||||
|
}
|
||||||
|
readBufferLeft -= pos;
|
||||||
|
readBufferPtr += pos;
|
||||||
linesRead++;
|
linesRead++;
|
||||||
if(only_head_word &&( line.isEmpty()||line.at(0).isSpace()))
|
if(only_head_word &&( line.isEmpty()||line.at(0).isSpace()))
|
||||||
continue;
|
continue;
|
||||||
|
@ -1108,6 +1137,31 @@ char const * DslIconv::getEncodingNameFor( DslEncoding e )
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Selects the encoded form of the line-feed character (U+000A) for the given
// file encoding and stores it in lineFeed / lineFeedLength, so that raw bytes
// can be split into lines before they are decoded.
//
//   e - the encoding of the dictionary file.
//
// lineFeed is pointed at static constant storage instead of a fresh heap
// allocation: the sequences are fixed, so nothing has to be freed and calling
// this function more than once does not leak.
void DslScanner::initLineFeed(DslEncoding e)
{
  static const char utf16LeLineFeed[ 2 ] = { 0x0A, 0 };
  static const char utf16BeLineFeed[ 2 ] = { 0, 0x0A };
  static const char singleByteLineFeed[ 1 ] = { 0x0A };

  switch (e)
  {
    case Utf16LE:
      lineFeed = utf16LeLineFeed;
      lineFeedLength = 2;
      break;
    case Utf16BE:
      lineFeed = utf16BeLineFeed;
      lineFeedLength = 2;
      break;
    // Every other encoding represents the line feed as the single byte 0x0A.
    case Windows1252:
    case Windows1251:
    case Details::Utf8:
    case Windows1250:
    default:
      lineFeed = singleByteLineFeed;
      lineFeedLength = 1;
      break;
  }
}
|
||||||
|
|
||||||
void processUnsortedParts( wstring & str, bool strip )
|
void processUnsortedParts( wstring & str, bool strip )
|
||||||
{
|
{
|
||||||
int refCount = 0;
|
int refCount = 0;
|
||||||
|
|
|
@ -126,11 +126,13 @@ class DslScanner
|
||||||
wstring dictionaryName;
|
wstring dictionaryName;
|
||||||
wstring langFrom, langTo;
|
wstring langFrom, langTo;
|
||||||
wstring soundDictionary;
|
wstring soundDictionary;
|
||||||
char readBuffer[ 10000 ];
|
char readBuffer[ 65536 ];
|
||||||
QTextStream* fragStream;
|
QTextStream* fragStream;
|
||||||
char * readBufferPtr;
|
char * readBufferPtr;
|
||||||
|
const char* lineFeed;
|
||||||
|
int lineFeedLength;
|
||||||
size_t readBufferLeft;
|
size_t readBufferLeft;
|
||||||
qint64 pos;
|
//qint64 pos;
|
||||||
unsigned linesRead;
|
unsigned linesRead;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -148,6 +150,7 @@ public:
|
||||||
/// Returns the detected encoding of this file.
|
/// Returns the detected encoding of this file.
|
||||||
DslEncoding getEncoding() const
|
DslEncoding getEncoding() const
|
||||||
{ return encoding; }
|
{ return encoding; }
|
||||||
|
void initLineFeed(DslEncoding e);
|
||||||
|
|
||||||
/// Returns the dictionary's name, as was read from file's headers.
|
/// Returns the dictionary's name, as was read from file's headers.
|
||||||
wstring const & getDictionaryName() const
|
wstring const & getDictionaryName() const
|
||||||
|
|
Loading…
Reference in a new issue