Improve DSL dictionary loading performance

This commit is contained in:
xiaoyifang 2021-11-06 10:10:37 +08:00
parent fb2b667cc5
commit 2633b32458
2 changed files with 96 additions and 39 deletions

View file

@ -12,6 +12,7 @@
#include <stdio.h> #include <stdio.h>
#include <wctype.h> #include <wctype.h>
#include <algorithm>
namespace Dsl { namespace Dsl {
namespace Details { namespace Details {
@ -40,6 +41,18 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
#endif #endif
/// Returns the size in bytes of the first line stored in the buffer s1,
/// including its terminator.
///
/// @param s1        start of the raw (still encoded) byte buffer
/// @param s1length  number of valid bytes in s1
/// @param s2        encoded line terminator to look for
/// @param s2length  size of the terminator in bytes (one code unit)
/// @return offset just past the first terminator, or s1length when the buffer
///         holds no terminator (the remainder is then one unterminated line).
///         This function never returns -1. Returns 0 for an empty buffer or
///         an empty terminator.
///
/// The terminator is only accepted at offsets that are a multiple of
/// s2length. With a 2-byte UTF-16 terminator a plain byte search could match
/// across two neighbouring code units (e.g. bytes 41 0A | 00 01) and report a
/// line break that is not there.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
{
  if ( s1length <= 0 || s2length <= 0 )
    return 0;

  char * const bufferEnd = s1 + s1length;
  char * searchFrom = s1;

  for ( ;; )
  {
    char * const hit = std::search( searchFrom, bufferEnd, s2, s2 + s2length );

    if ( hit == bufferEnd )
      return s1length; // no terminator: the whole remainder is the line

    if ( ( hit - s1 ) % s2length == 0 )
      return static_cast< int >( hit - s1 ) + s2length;

    // Misaligned match straddling two code units - keep looking.
    searchFrom = hit + 1;
  }
}
static DSLLangCode LangCodes[] = static DSLLangCode LangCodes[] =
{ {
{ 1, "en" }, { 1, "en" },
@ -780,35 +793,36 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
{ {
if ( !*stringPos ) if ( !*stringPos )
throw eot(); throw eot();
else{
ch = *stringPos++;
ch = *stringPos++; if ( ch == L'\\' )
{
if ( !*stringPos )
throw eot();
if ( ch == L'\\' ) ch = *stringPos++;
{
if ( !*stringPos )
throw eot();
ch = *stringPos++; escaped = true;
}
else
if ( ch == L'[' && *stringPos == L'[' )
{
++stringPos;
escaped = true;
}
else
if ( ch == L']' && *stringPos == L']' )
{
++stringPos;
escaped = true;
}
else
escaped = false;
escaped = true; if( ch == '\n' || ch == '\r' )
lineStartPos = stringPos;
} }
else
if ( ch == L'[' && *stringPos == L'[' )
{
++stringPos;
escaped = true;
}
else
if ( ch == L']' && *stringPos == L']' )
{
++stringPos;
escaped = true;
}
else
escaped = false;
if( ch == '\n' || ch == '\r' )
lineStartPos = stringPos;
} }
bool ArticleDom::atSignFirstInLine() bool ArticleDom::atSignFirstInLine()
@ -824,7 +838,7 @@ bool ArticleDom::atSignFirstInLine()
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ), encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), linesRead( 0 ), pos(0) readBufferLeft( 0 ), linesRead( 0 )
{ {
// Since .dz is backwards-compatible with .gz, we use gz- functions to // Since .dz is backwards-compatible with .gz, we use gz- functions to
// read it -- they are much nicer than the dict_data- ones. // read it -- they are much nicer than the dict_data- ones.
@ -890,8 +904,9 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
} }
} }
iconv.reinit( encoding ); //iconv.reinit( encoding );
codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding)); codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding));
initLineFeed(encoding);
// We now can use our own readNextLine() function // We now can use our own readNextLine() function
wstring str; wstring str;
@ -984,7 +999,6 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
gzseek( f, offset, SEEK_SET ); gzseek( f, offset, SEEK_SET );
readBufferPtr = readBuffer; readBufferPtr = readBuffer;
readBufferLeft = 0; readBufferLeft = 0;
pos = 0;
if ( needExactEncoding ) if ( needExactEncoding )
iconv.reinit( encoding ); iconv.reinit( encoding );
@ -998,15 +1012,15 @@ DslScanner::~DslScanner() throw()
bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex, bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex,
Iconv::Ex ) Iconv::Ex )
{ {
offset = (size_t)( gztell( f ) - readBufferLeft+pos ); offset = (size_t)( gztell( f ) - readBufferLeft/*+pos*/ );
for(;;) for(;;)
{ {
// Check that we have bytes to read // Check that we have bytes to read
if ( readBufferLeft-pos < 2000 ) if ( readBufferLeft < 5000 )
{ {
readBufferPtr+=pos; //readBufferPtr+=pos;
readBufferLeft-=pos; //readBufferLeft-=pos;
if ( !gzeof( f ) ) if ( !gzeof( f ) )
{ {
// To avoid having to deal with ring logic, we move the remaining bytes // To avoid having to deal with ring logic, we move the remaining bytes
@ -1022,17 +1036,32 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
readBufferPtr = readBuffer; readBufferPtr = readBuffer;
readBufferLeft += (size_t) result; readBufferLeft += (size_t) result;
QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft); /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
fragStream = new QTextStream(frag) ; fragStream = new QTextStream(frag) ;
fragStream->setCodec(codec); fragStream->setCodec(codec);*/
} }
} }
if(fragStream->atEnd()) //if(fragStream->atEnd())
return false; // return false;
QString line=fragStream->readLine(); if(readBufferLeft<=0)
pos = fragStream->pos(); return false;
//QString line=fragStream->readLine();
int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);
if(line.endsWith("\n"))
line.chop(1);
if(line.endsWith("\r"))
line.chop(1);
if(pos>readBufferLeft){
pos=readBufferLeft;
}
readBufferLeft -= pos;
readBufferPtr += pos;
linesRead++; linesRead++;
if(only_head_word &&( line.isEmpty()||line.at(0).isSpace())) if(only_head_word &&( line.isEmpty()||line.at(0).isSpace()))
continue; continue;
@ -1108,6 +1137,31 @@ char const * DslIconv::getEncodingNameFor( DslEncoding e )
} }
/// Selects the encoded form of the line feed character (U+000A) for the
/// encoding e and stores it in lineFeed / lineFeedLength.
///
/// UTF-16 encodings use a two-byte terminator whose byte order follows the
/// encoding; every other encoding uses the single byte 0x0A.
///
/// The terminators live in static storage, so nothing is allocated here and
/// the function may be called repeatedly without leaking. lineFeed must not
/// be passed to delete[].
void DslScanner::initLineFeed(DslEncoding e)
{
  static const char lineFeedUtf16LE[ 2 ] = { 0x0A, 0x00 };
  static const char lineFeedUtf16BE[ 2 ] = { 0x00, 0x0A };
  static const char lineFeedSingleByte[ 1 ] = { 0x0A };

  switch (e)
  {
    case Utf16LE:
      lineFeed = lineFeedUtf16LE;
      lineFeedLength = 2;
      break;
    case Utf16BE:
      lineFeed = lineFeedUtf16BE;
      lineFeedLength = 2;
      break;
    case Windows1252:
    case Windows1251:
    case Details::Utf8:
    case Windows1250:
    default:
      lineFeed = lineFeedSingleByte;
      lineFeedLength = 1;
      break;
  }
}
void processUnsortedParts( wstring & str, bool strip ) void processUnsortedParts( wstring & str, bool strip )
{ {
int refCount = 0; int refCount = 0;

View file

@ -126,11 +126,13 @@ class DslScanner
wstring dictionaryName; wstring dictionaryName;
wstring langFrom, langTo; wstring langFrom, langTo;
wstring soundDictionary; wstring soundDictionary;
char readBuffer[ 10000 ]; char readBuffer[ 65536 ];
QTextStream* fragStream; QTextStream* fragStream;
char * readBufferPtr; char * readBufferPtr;
const char* lineFeed;
int lineFeedLength;
size_t readBufferLeft; size_t readBufferLeft;
qint64 pos; //qint64 pos;
unsigned linesRead; unsigned linesRead;
public: public:
@ -148,6 +150,7 @@ public:
/// Returns the detected encoding of this file. /// Returns the detected encoding of this file.
DslEncoding getEncoding() const DslEncoding getEncoding() const
{ return encoding; } { return encoding; }
void initLineFeed(DslEncoding e);
/// Returns the dictionary's name, as was read from file's headers. /// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const wstring const & getDictionaryName() const