From b771f0e340ced72b3d45a3ffc75bd4f27ae6e8c7 Mon Sep 17 00:00:00 2001
From: YiFang Xiao <yifang.xiao@outlook.com>
Date: Thu, 31 Mar 2022 17:51:22 +0800
Subject: [PATCH] Simplify Utf8::encode/Utf8::decode call sites by using the string overloads.

---
 article_maker.cc |  2 +-
 bgl.cc           | 15 +-------
 btreeidx.cc      | 89 ++++++++++++------------------------------------
 3 files changed, 23 insertions(+), 83 deletions(-)
diff --git a/article_maker.cc b/article_maker.cc
index e6b04075..db56ca2b 100644
--- a/article_maker.cc
+++ b/article_maker.cc
@@ -126,7 +126,7 @@ std::string ArticleMaker::makeHtmlHeader( QString const & word,
     }
   }
 
-  result += "<title>" + Html::escape( Utf8::encode( gd::toWString( word ) ) ) + "</title>";
+  result += "<title>" + Html::escape( word.toStdString()) + "</title>";
 
   // This doesn't seem to be much of influence right now, but we'll keep
   // it anyway.
diff --git a/bgl.cc b/bgl.cc
index bec116d6..ec11ba0b 100644
--- a/bgl.cc
+++ b/bgl.cc
@@ -178,20 +178,7 @@ namespace
     }
 
     // Convert the word from utf8 to wide chars
-
-    if ( wcharBuffer.size() <= word.size() )
-      wcharBuffer.resize( word.size() + 1 );
-
-    long result = Utf8::decode( word.c_str(), word.size(),
-                                &wcharBuffer.front() );
-
-    if ( result < 0 )
-    {
-      gdWarning( "Failed to decode utf8 of headword \"%s\", skipping it.", word.c_str() );
-      return;
-    }
-
-    indexedWords.addWord( wstring( &wcharBuffer.front(), result ), articleOffset );
+    indexedWords.addWord( Utf8::decode( word ), articleOffset );
   }
 
 
diff --git a/btreeidx.cc b/btreeidx.cc
index 16cb687c..645ae4e1 100644
--- a/btreeidx.cc
+++ b/btreeidx.cc
@@ -526,8 +526,8 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
   
   // Lookup the index by traversing the index btree
 
-  vector< wchar > wcharBuffer;
-
+  // Holds the most recently decoded headword; reassigned before each comparison in this function
+  wstring w_word;
   exactMatch = false;
 
   // Read a node
@@ -615,20 +615,10 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
           --closestString;
   
         size_t wordSize = strlen( closestString );
-  
-        if ( wcharBuffer.size() <= wordSize )
-          wcharBuffer.resize( wordSize + 1 );
-  
-        long result = Utf8::decode( closestString, wordSize, &wcharBuffer.front() );
 
-        if ( result < 0 )
-          throw Utf8::exCantDecode( closestString );
-  
-        wcharBuffer[ result ] = 0;
+        w_word = Utf8::decode( string( closestString, wordSize ) );
 
-        //GD_DPRINTF( "Checking against %s\n", closestString );
-
-        compareResult = target.compare( &wcharBuffer.front() );
+        compareResult = target.compare( w_word);
   
         if ( !compareResult )
         {
@@ -749,22 +739,12 @@ char const * BtreeIndex::findChainOffsetExactOrPrefix( wstring const & target,
         ptr += sizeof( uint32_t );
   
         size_t wordSize = strlen( ptr );
+
+        w_word = Utf8::decode( string( ptr, wordSize ) );
   
-        if ( wcharBuffer.size() <= wordSize )
-          wcharBuffer.resize( wordSize + 1 );
-  
-        //GD_DPRINTF( "checking against word %s, left = %u\n", ptr, leafEntries );
-  
-        long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
-  
-        if ( result < 0 )
-          throw Utf8::exCantDecode( ptr );
-  
-        wcharBuffer[ result ] = 0;
-  
-        wstring foldedWord = Folding::apply( &wcharBuffer.front() );
+        wstring foldedWord = Folding::apply( w_word );
         if( foldedWord.empty() )
-          foldedWord = Folding::applyWhitespaceOnly( &wcharBuffer.front() );
+          foldedWord = Folding::applyWhitespaceOnly( w_word );
   
         int compareResult = target.compare( foldedWord );
   
@@ -1071,22 +1051,10 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign
 
   // Safeguard us against various bugs here. Don't attempt adding words
   // which are freakishly huge.
-  if ( wordSize > maxHeadwordSize )
+  if( wordSize > maxHeadwordSize )
   {
-#define MAX_LOG_WORD_SIZE 500
-    string headword;
-    if( wordSize <= MAX_LOG_WORD_SIZE )
-      headword = Utf8::encode( word );
-    else
-    {
-      std::vector< char > buffer( MAX_LOG_WORD_SIZE * 4 );
-      headword = string( &buffer.front(),
-                         Utf8::encode( wordBegin, MAX_LOG_WORD_SIZE, &buffer.front() ) );
-      headword += "...";
-    }
-    gdWarning( "Skipped too long headword: \"%s\"", headword.c_str() );
+    qWarning() << "Skipped too long headword: " << Utf8::encode( word.substr( 0, 100 ) ).c_str() << "size:" << wordSize; // UTF-8 encode first: a wide-char pointer streamed to QDebug would be logged as an address, not text
     return;
-#undef MAX_LOG_WORD_SIZE
   }
 
   // Skip any leading whitespace
@@ -1118,17 +1086,11 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign
               wstring folded = Folding::applyWhitespaceOnly( wstring( wordBegin, wordSize ) );
               if( !folded.empty() )
               {
-                  iterator i = insert(
-                    IndexedWords::value_type(
-                      string( &utfBuffer.front(),
-                              Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ),
-                      vector< WordArticleLink >() ) ).first;
+                iterator i = insert( { Utf8::encode( folded ),
+                                       vector< WordArticleLink >() } )
+                               .first;
 
-                  // Try to conserve memory somewhat -- slow insertions are ok
-                  i->second.reserve( i->second.size() + 1 );
-
-                  string utfWord( &utfBuffer.front(),
-                                  Utf8::encode( wordBegin, wordSize, &utfBuffer.front() ) );
+                string utfWord=Utf8::encode( wstring(wordBegin, wordSize )) ;
                   string utfPrefix;
                   i->second.push_back( WordArticleLink( utfWord, articleOffset, utfPrefix ) );
               }
@@ -1142,24 +1104,15 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset, unsign
 
     // Insert this word
     wstring folded = Folding::apply( nextChar );
-    
-    iterator i = insert(
-      IndexedWords::value_type(
-        string( &utfBuffer.front(),
-                Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ),
-        vector< WordArticleLink >() ) ).first;
 
-    if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches
+    iterator i = insert( { Utf8::encode( folded ), vector< WordArticleLink >() } ).first;
+
+    if( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches
     {
-      // Try to conserve memory somewhat -- slow insertions are ok
-      i->second.reserve( i->second.size() + 1 );
-  
-      string utfWord( &utfBuffer.front(),
-                      Utf8::encode( nextChar, wordSize - ( nextChar - wordBegin ), &utfBuffer.front() ) );
-  
-      string utfPrefix( &utfBuffer.front(),
-                        Utf8::encode( wordBegin, nextChar - wordBegin, &utfBuffer.front() ) );
-  
+      string utfWord = Utf8::encode( wstring( nextChar, wordSize - ( nextChar - wordBegin ) ) );
+
+      string utfPrefix = Utf8::encode( wstring( wordBegin, nextChar - wordBegin ) );
+
       i->second.push_back( WordArticleLink( utfWord, articleOffset, utfPrefix ) );
     }