load dsl dictionary performance improved

This commit is contained in:
xiaoyifang 2021-11-06 10:10:37 +08:00
parent fb2b667cc5
commit 2633b32458
2 changed files with 96 additions and 39 deletions

View file

@ -12,6 +12,7 @@
#include <stdio.h>
#include <wctype.h>
#include <algorithm>
namespace Dsl {
namespace Details {
@ -40,6 +41,18 @@ int wcscasecmp( const wchar *s1, const wchar *s2 )
#endif
//get the first line in string s1. -1 if not found
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length)
{
char* pos = std::search(s1,s1+s1length, s2, s2+s2length);
if (pos == s1 + s1length)
return pos-s1;
//the line size.
return pos- s1+ s2length;
}
static DSLLangCode LangCodes[] =
{
{ 1, "en" },
@ -780,35 +793,36 @@ void ArticleDom::nextChar() THROW_SPEC( eot )
{
if ( !*stringPos )
throw eot();
else{
ch = *stringPos++;
ch = *stringPos++;
if ( ch == L'\\' )
{
if ( !*stringPos )
throw eot();
if ( ch == L'\\' )
{
if ( !*stringPos )
throw eot();
ch = *stringPos++;
ch = *stringPos++;
escaped = true;
}
else
if ( ch == L'[' && *stringPos == L'[' )
{
++stringPos;
escaped = true;
}
else
if ( ch == L']' && *stringPos == L']' )
{
++stringPos;
escaped = true;
}
else
escaped = false;
escaped = true;
if( ch == '\n' || ch == '\r' )
lineStartPos = stringPos;
}
else
if ( ch == L'[' && *stringPos == L'[' )
{
++stringPos;
escaped = true;
}
else
if ( ch == L']' && *stringPos == L']' )
{
++stringPos;
escaped = true;
}
else
escaped = false;
if( ch == '\n' || ch == '\r' )
lineStartPos = stringPos;
}
bool ArticleDom::atSignFirstInLine()
@ -824,7 +838,7 @@ bool ArticleDom::atSignFirstInLine()
DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), linesRead( 0 ), pos(0)
readBufferLeft( 0 ), linesRead( 0 )
{
// Since .dz is backwards-compatible with .gz, we use gz- functions to
// read it -- they are much nicer than the dict_data- ones.
@ -890,8 +904,9 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
}
}
iconv.reinit( encoding );
//iconv.reinit( encoding );
codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding));
initLineFeed(encoding);
// We now can use our own readNextLine() function
wstring str;
@ -984,7 +999,6 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ):
gzseek( f, offset, SEEK_SET );
readBufferPtr = readBuffer;
readBufferLeft = 0;
pos = 0;
if ( needExactEncoding )
iconv.reinit( encoding );
@ -998,15 +1012,15 @@ DslScanner::~DslScanner() throw()
bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex,
Iconv::Ex )
{
offset = (size_t)( gztell( f ) - readBufferLeft+pos );
offset = (size_t)( gztell( f ) - readBufferLeft/*+pos*/ );
for(;;)
{
// Check that we have bytes to read
if ( readBufferLeft-pos < 2000 )
if ( readBufferLeft < 5000 )
{
readBufferPtr+=pos;
readBufferLeft-=pos;
//readBufferPtr+=pos;
//readBufferLeft-=pos;
if ( !gzeof( f ) )
{
// To avoid having to deal with ring logic, we move the remaining bytes
@ -1022,17 +1036,32 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo
readBufferPtr = readBuffer;
readBufferLeft += (size_t) result;
QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
/*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft);
fragStream = new QTextStream(frag) ;
fragStream->setCodec(codec);
fragStream->setCodec(codec);*/
}
}
if(fragStream->atEnd())
return false;
//if(fragStream->atEnd())
// return false;
QString line=fragStream->readLine();
pos = fragStream->pos();
if(readBufferLeft<=0)
return false;
//QString line=fragStream->readLine();
int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength);
if(pos==-1)
return false;
QString line = codec->toUnicode(readBufferPtr, pos);
if(line.endsWith("\n"))
line.chop(1);
if(line.endsWith("\r"))
line.chop(1);
if(pos>readBufferLeft){
pos=readBufferLeft;
}
readBufferLeft -= pos;
readBufferPtr += pos;
linesRead++;
if(only_head_word &&( line.isEmpty()||line.at(0).isSpace()))
continue;
@ -1108,6 +1137,31 @@ char const * DslIconv::getEncodingNameFor( DslEncoding e )
}
void DslScanner::initLineFeed(DslEncoding e)
{
switch (e)
{
case Utf16LE:
lineFeed= new char[2] {0x0A,0};
lineFeedLength = 2;
break;
case Utf16BE:
lineFeed = new char[2] { 0,0x0A};
lineFeedLength = 2;
break;
case Windows1252:
case Windows1251:
case Details::Utf8:
case Windows1250:
default:
lineFeedLength = 1;
lineFeed = new char[1] {0x0A};
}
}
void processUnsortedParts( wstring & str, bool strip )
{
int refCount = 0;

View file

@ -126,11 +126,13 @@ class DslScanner
wstring dictionaryName;
wstring langFrom, langTo;
wstring soundDictionary;
char readBuffer[ 10000 ];
char readBuffer[ 65536 ];
QTextStream* fragStream;
char * readBufferPtr;
const char* lineFeed;
int lineFeedLength;
size_t readBufferLeft;
qint64 pos;
//qint64 pos;
unsigned linesRead;
public:
@ -148,6 +150,7 @@ public:
/// Returns the detected encoding of this file.
DslEncoding getEncoding() const
{ return encoding; }
void initLineFeed(DslEncoding e);
/// Returns the dictionary's name, as was read from file's headers.
wstring const & getDictionaryName() const