diff --git a/src/btreeidx.cc b/src/btreeidx.cc index 98f00ec0..f9da0ffe 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -780,8 +780,6 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, unsigned prevEntry = 0; - vector< char > charBuffer; - for( unsigned x = 0; x < maxElements; ++x ) { unsigned curEntry = (uint64_t) indexSize * ( x + 1 ) / ( maxElements + 1 ); @@ -793,18 +791,13 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex, memcpy( &uncompressedData.front() + sizeof( uint32_t ) + x * sizeof( uint32_t ), &offset, sizeof( uint32_t ) ); - if ( charBuffer.size() < nextIndex->first.size() * 4 ) - charBuffer.resize( nextIndex->first.size() * 4 ); - - size_t sz = Utf8::encode( nextIndex->first.data(), nextIndex->first.size(), - &charBuffer.front() ); + size_t sz = nextIndex->first.size() + 1; size_t prevSize = uncompressedData.size(); - uncompressedData.resize( prevSize + sz + 1 ); + uncompressedData.resize( prevSize + sz ); - memcpy( &uncompressedData.front() + prevSize, &charBuffer.front(), sz ); - - uncompressedData.back() = 0; + memcpy( &uncompressedData.front() + prevSize, nextIndex->first.c_str(), + sz ); prevEntry = curEntry; } @@ -914,9 +907,12 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset ) } // Insert this word + wstring folded = Folding::apply( nextChar ); + iterator i = insert( IndexedWords::value_type( - Folding::apply( nextChar ), + string( &utfBuffer.front(), + Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ), vector< WordArticleLink >() ) ).first; if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches diff --git a/src/btreeidx.hh b/src/btreeidx.hh index 403a18c3..89a1bede 100644 --- a/src/btreeidx.hh +++ b/src/btreeidx.hh @@ -139,8 +139,9 @@ private: /// This represents the index in its source form, as a map which binds folded /// words to sequences of their unfolded source forms and the corresponding -/// article offsets. -struct IndexedWords: public map< wstring, vector< WordArticleLink > > +/// article offsets. The words are utf8-encoded -- it doesn't break Unicode +/// sorting, but conserves space. +struct IndexedWords: public map< string, vector< WordArticleLink > > { /// Instead of adding to the map directly, use this function. It does folding /// itself, and for phrases/sentences it adds additional entries beginning with