From 2633b324589a3cf289ada6085e6d8a9659615b83 Mon Sep 17 00:00:00 2001 From: xiaoyifang Date: Sat, 6 Nov 2021 10:10:37 +0800 Subject: [PATCH] load dsl dictionary performance improved --- dsl_details.cc | 128 +++++++++++++++++++++++++++++++++++-------------- dsl_details.hh | 7 ++- 2 files changed, 96 insertions(+), 39 deletions(-) diff --git a/dsl_details.cc b/dsl_details.cc index ff0b8299..cf2f2a94 100644 --- a/dsl_details.cc +++ b/dsl_details.cc @@ -12,6 +12,7 @@ #include #include +#include namespace Dsl { namespace Details { @@ -40,6 +41,18 @@ int wcscasecmp( const wchar *s1, const wchar *s2 ) #endif +//get the first line in string s1. -1 if not found +int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length) +{ + char* pos = std::search(s1,s1+s1length, s2, s2+s2length); + + if (pos == s1 + s1length) + return pos-s1; + + //the line size. + return pos- s1+ s2length; +} + static DSLLangCode LangCodes[] = { { 1, "en" }, @@ -780,35 +793,36 @@ void ArticleDom::nextChar() THROW_SPEC( eot ) { if ( !*stringPos ) throw eot(); + else{ + ch = *stringPos++; - ch = *stringPos++; + if ( ch == L'\\' ) + { + if ( !*stringPos ) + throw eot(); - if ( ch == L'\\' ) - { - if ( !*stringPos ) - throw eot(); + ch = *stringPos++; - ch = *stringPos++; + escaped = true; + } + else + if ( ch == L'[' && *stringPos == L'[' ) + { + ++stringPos; + escaped = true; + } + else + if ( ch == L']' && *stringPos == L']' ) + { + ++stringPos; + escaped = true; + } + else + escaped = false; - escaped = true; + if( ch == '\n' || ch == '\r' ) + lineStartPos = stringPos; } - else - if ( ch == L'[' && *stringPos == L'[' ) - { - ++stringPos; - escaped = true; - } - else - if ( ch == L']' && *stringPos == L']' ) - { - ++stringPos; - escaped = true; - } - else - escaped = false; - - if( ch == '\n' || ch == '\r' ) - lineStartPos = stringPos; } bool ArticleDom::atSignFirstInLine() @@ -824,7 +838,7 @@ bool ArticleDom::atSignFirstInLine() DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): encoding( Windows1252 ), iconv( encoding ), readBufferPtr( readBuffer ), - readBufferLeft( 0 ), linesRead( 0 ), pos(0) + readBufferLeft( 0 ), linesRead( 0 ) { // Since .dz is backwards-compatible with .gz, we use gz- functions to // read it -- they are much nicer than the dict_data- ones. @@ -890,8 +904,9 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): } } - iconv.reinit( encoding ); + //iconv.reinit( encoding ); codec = QTextCodec::codecForName(iconv.getEncodingNameFor(encoding)); + initLineFeed(encoding); // We now can use our own readNextLine() function wstring str; @@ -984,7 +999,6 @@ DslScanner::DslScanner( string const & fileName ) THROW_SPEC( Ex, Iconv::Ex ): gzseek( f, offset, SEEK_SET ); readBufferPtr = readBuffer; readBufferLeft = 0; - pos = 0; if ( needExactEncoding ) iconv.reinit( encoding ); @@ -998,15 +1012,15 @@ DslScanner::~DslScanner() throw() bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_word ) THROW_SPEC( Ex, Iconv::Ex ) { - offset = (size_t)( gztell( f ) - readBufferLeft+pos ); + offset = (size_t)( gztell( f ) - readBufferLeft/*+pos*/ ); for(;;) { // Check that we have bytes to read - if ( readBufferLeft-pos < 2000 ) + if ( readBufferLeft < 5000 ) { - readBufferPtr+=pos; - readBufferLeft-=pos; + //readBufferPtr+=pos; + //readBufferLeft-=pos; if ( !gzeof( f ) ) { // To avoid having to deal with ring logic, we move the remaining bytes @@ -1022,17 +1036,32 @@ bool DslScanner::readNextLine( wstring & out, size_t & offset, bool only_head_wo readBufferPtr = readBuffer; readBufferLeft += (size_t) result; - QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft); + /*QByteArray frag = QByteArray::fromRawData(readBuffer, readBufferLeft); fragStream = new QTextStream(frag) ; - fragStream->setCodec(codec); + fragStream->setCodec(codec);*/ } } - if(fragStream->atEnd()) - return false; + //if(fragStream->atEnd()) + // return false; - QString line=fragStream->readLine(); - pos = fragStream->pos(); + if(readBufferLeft<=0) + return false; + //QString line=fragStream->readLine(); + int pos = findFirstLinePosition(readBufferPtr,readBufferLeft, lineFeed,lineFeedLength); + if(pos==-1) + return false; + QString line = codec->toUnicode(readBufferPtr, pos); + if(line.endsWith("\n")) + line.chop(1); + if(line.endsWith("\r")) + line.chop(1); + + if(pos>readBufferLeft){ + pos=readBufferLeft; + } + readBufferLeft -= pos; + readBufferPtr += pos; linesRead++; if(only_head_word &&( line.isEmpty()||line.at(0).isSpace())) continue; @@ -1108,6 +1137,31 @@ char const * DslIconv::getEncodingNameFor( DslEncoding e ) } +void DslScanner::initLineFeed(DslEncoding e) +{ + switch (e) + { + case Utf16LE: + lineFeed= new char[2] {0x0A,0}; + lineFeedLength = 2; + break; + case Utf16BE: + lineFeed = new char[2] { 0,0x0A}; + lineFeedLength = 2; + break; + case Windows1252: + + case Windows1251: + + case Details::Utf8: + + case Windows1250: + default: + lineFeedLength = 1; + lineFeed = new char[1] {0x0A}; + } +} + void processUnsortedParts( wstring & str, bool strip ) { int refCount = 0; diff --git a/dsl_details.hh b/dsl_details.hh index 35fa7103..7ae16451 100644 --- a/dsl_details.hh +++ b/dsl_details.hh @@ -126,11 +126,13 @@ class DslScanner wstring dictionaryName; wstring langFrom, langTo; wstring soundDictionary; - char readBuffer[ 10000 ]; + char readBuffer[ 65536 ]; QTextStream* fragStream; char * readBufferPtr; + const char* lineFeed; + int lineFeedLength; size_t readBufferLeft; - qint64 pos; + //qint64 pos; unsigned linesRead; public: @@ -148,6 +150,7 @@ public: /// Returns the detected encoding of this file. DslEncoding getEncoding() const { return encoding; } + void initLineFeed(DslEncoding e); /// Returns the dictionary's name, as was read from file's headers. wstring const & getDictionaryName() const