From b771f0e340ced72b3d45a3ffc75bd4f27ae6e8c7 Mon Sep 17 00:00:00 2001 From: YiFang Xiao Date: Thu, 31 Mar 2022 17:51:22 +0800 Subject: [PATCH] optimize the utf8::encoding method invocation. --- article_maker.cc | 2 +- bgl.cc | 15 +------- btreeidx.cc | 89 ++++++++++++------------------------------------ 3 files changed, 23 insertions(+), 83 deletions(-) diff --git a/article_maker.cc b/article_maker.cc index e6b04075..db56ca2b 100644 --- a/article_maker.cc +++ b/article_maker.cc @@ -126,7 +126,7 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word, } } - result += "" + Html::escape( Utf8::encode( gd::toWString( word ) ) ) + ""; + result += "" + Html::escape( word.toStdString()) + ""; // This doesn't seem to be much of influence right now, but we'll keep // it anyway. diff --git a/bgl.cc b/bgl.cc index bec116d6..ec11ba0b 100644 --- a/bgl.cc +++ b/bgl.cc @@ -178,20 +178,7 @@ namespace } // Convert the word from utf8 to wide chars - - if ( wcharBuffer.size() <= word.size() ) - wcharBuffer.resize( word.size() + 1 ); - - long result = Utf8::decode( word.c_str(), word.size(), - &wcharBuffer.front() ); - - if ( result < 0 ) - { - gdWarning( "Failed to decode utf8 of headword \"%s\", skipping it.", word.c_str() ); - return; - } - - indexedWords.addWord( wstring( &wcharBuffer.front(), result ), articleOffset ); + indexedWords.addWord( Utf8::decode( word ), articleOffset ); } diff --git a/btreeidx.cc b/btreeidx.cc index 16cb687c..645ae4e1 100644 --- a/btreeidx.cc +++ b/btreeidx.cc @@ -526,8 +526,8 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, // Lookup the index by traversing the index btree - vector< wchar > wcharBuffer; - + // vector< wchar > wcharBuffer; + wstring w_word; exactMatch = false; // Read a node @@ -615,20 +615,10 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, --closestString; size_t wordSize = strlen( closestString ); - - if ( wcharBuffer.size() <= wordSize ) - wcharBuffer.resize( wordSize + 1 ); - - long result = Utf8::decode( closestString, wordSize, &wcharBuffer.front() ); - if ( result < 0 ) - throw Utf8::exCantDecode( closestString ); - - wcharBuffer[ result ] = 0; + w_word = Utf8::decode( string( closestString, wordSize ) ); - //GD_DPRINTF( "Checking against %s\n", closestString ); - - compareResult = target.compare( &wcharBuffer.front() ); + compareResult = target.compare( w_word); if ( !compareResult ) { @@ -749,22 +739,12 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target, ptr += sizeof( uint32_t ); size_t wordSize = strlen( ptr ); + + w_word = Utf8::decode( string( ptr, wordSize ) ); - if ( wcharBuffer.size() <= wordSize ) - wcharBuffer.resize( wordSize + 1 ); - - //GD_DPRINTF( "checking against word %s, left = %u\n", ptr, leafEntries ); - - long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() ); - - if ( result < 0 ) - throw Utf8::exCantDecode( ptr ); - - wcharBuffer[ result ] = 0; - - wstring foldedWord = Folding::apply( &wcharBuffer.front() ); + wstring foldedWord = Folding::apply( w_word ); if( foldedWord.empty() ) - foldedWord = Folding::applyWhitespaceOnly( &wcharBuffer.front() ); + foldedWord = Folding::applyWhitespaceOnly( w_word ); int compareResult = target.compare( foldedWord ); @@ -1071,22 +1051,10 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign // Safeguard us against various bugs here. Don't attempt adding words // which are freakishly huge. - if ( wordSize > maxHeadwordSize ) + if( wordSize > maxHeadwordSize ) { -#define MAX_LOG_WORD_SIZE 500 - string headword; - if( wordSize <= MAX_LOG_WORD_SIZE ) - headword = Utf8::encode( word ); - else - { - std::vector< char > buffer( MAX_LOG_WORD_SIZE * 4 ); - headword = string( &buffer.front(), - Utf8::encode( wordBegin, MAX_LOG_WORD_SIZE, &buffer.front() ) ); - headword += "..."; - } - gdWarning( "Skipped too long headword: \"%s\"", headword.c_str() ); + qWarning() << "Skipped too long headword: " << word.substr( 0, 100 ).c_str() << "size:" << wordSize; return; -#undef MAX_LOG_WORD_SIZE } // Skip any leading whitespace @@ -1118,17 +1086,11 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign wstring folded = Folding::applyWhitespaceOnly( wstring( wordBegin, wordSize ) ); if( !folded.empty() ) { - iterator i = insert( - IndexedWords::value_type( - string( &utfBuffer.front(), - Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ), - vector< WordArticleLink >() ) ).first; + iterator i = insert( { Utf8::encode( folded ), + vector< WordArticleLink >() } ) + .first; - // Try to conserve memory somewhat -- slow insertions are ok - i->second.reserve( i->second.size() + 1 ); - - string utfWord( &utfBuffer.front(), - Utf8::encode( wordBegin, wordSize, &utfBuffer.front() ) ); + string utfWord=Utf8::encode( wstring(wordBegin, wordSize )) ; string utfPrefix; i->second.push_back( WordArticleLink( utfWord, articleOffset, utfPrefix ) ); } @@ -1142,24 +1104,15 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign // Insert this word wstring folded = Folding::apply( nextChar ); - - iterator i = insert( - IndexedWords::value_type( - string( &utfBuffer.front(), - Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ), - vector< WordArticleLink >() ) ).first; - if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches + iterator i = insert( { Utf8::encode( folded ), vector< WordArticleLink >() } ).first; + + if( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches { - // Try to conserve memory somewhat -- slow insertions are ok - i->second.reserve( i->second.size() + 1 ); - - string utfWord( &utfBuffer.front(), - Utf8::encode( nextChar, wordSize - ( nextChar - wordBegin ), &utfBuffer.front() ) ); - - string utfPrefix( &utfBuffer.front(), - Utf8::encode( wordBegin, nextChar - wordBegin, &utfBuffer.front() ) ); - + string utfWord = Utf8::encode( wstring( nextChar, wordSize - ( nextChar - wordBegin ) ) ); + + string utfPrefix = Utf8::encode( wstring( wordBegin, nextChar - wordBegin ) ); + i->second.push_back( WordArticleLink( utfWord, articleOffset, utfPrefix ) ); }