mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 19:24:08 +00:00
* Use UTF-8-encoded strings instead of wide strings for word keys during
the indexing process -- this conserves memory while preserving the correct entry order.
This commit is contained in:
parent
a432f40093
commit
89fd4ffa31
|
@ -780,8 +780,6 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
|
|||
|
||||
unsigned prevEntry = 0;
|
||||
|
||||
vector< char > charBuffer;
|
||||
|
||||
for( unsigned x = 0; x < maxElements; ++x )
|
||||
{
|
||||
unsigned curEntry = (uint64_t) indexSize * ( x + 1 ) / ( maxElements + 1 );
|
||||
|
@ -793,18 +791,13 @@ static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
|
|||
|
||||
memcpy( &uncompressedData.front() + sizeof( uint32_t ) + x * sizeof( uint32_t ), &offset, sizeof( uint32_t ) );
|
||||
|
||||
if ( charBuffer.size() < nextIndex->first.size() * 4 )
|
||||
charBuffer.resize( nextIndex->first.size() * 4 );
|
||||
|
||||
size_t sz = Utf8::encode( nextIndex->first.data(), nextIndex->first.size(),
|
||||
&charBuffer.front() );
|
||||
size_t sz = nextIndex->first.size() + 1;
|
||||
|
||||
size_t prevSize = uncompressedData.size();
|
||||
uncompressedData.resize( prevSize + sz + 1 );
|
||||
uncompressedData.resize( prevSize + sz );
|
||||
|
||||
memcpy( &uncompressedData.front() + prevSize, &charBuffer.front(), sz );
|
||||
|
||||
uncompressedData.back() = 0;
|
||||
memcpy( &uncompressedData.front() + prevSize, nextIndex->first.c_str(),
|
||||
sz );
|
||||
|
||||
prevEntry = curEntry;
|
||||
}
|
||||
|
@ -914,9 +907,12 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset )
|
|||
}
|
||||
|
||||
// Insert this word
|
||||
wstring folded = Folding::apply( nextChar );
|
||||
|
||||
iterator i = insert(
|
||||
IndexedWords::value_type(
|
||||
Folding::apply( nextChar ),
|
||||
string( &utfBuffer.front(),
|
||||
Utf8::encode( folded.data(), folded.size(), &utfBuffer.front() ) ),
|
||||
vector< WordArticleLink >() ) ).first;
|
||||
|
||||
if ( ( i->second.size() < 1024 ) || ( nextChar == wordBegin ) ) // Don't overpopulate chains with middle matches
|
||||
|
|
|
@ -139,8 +139,9 @@ private:
|
|||
|
||||
/// This represents the index in its source form, as a map which binds folded
|
||||
/// words to sequences of their unfolded source forms and the corresponding
|
||||
/// article offsets.
|
||||
struct IndexedWords: public map< wstring, vector< WordArticleLink > >
|
||||
/// article offsets. The words are UTF-8-encoded -- this preserves the Unicode
|
||||
/// code point sort order of the keys while conserving space.
|
||||
struct IndexedWords: public map< string, vector< WordArticleLink > >
|
||||
{
|
||||
/// Instead of adding to the map directly, use this function. It does folding
|
||||
/// itself, and for phrases/sentences it adds additional entries beginning with
|
||||
|
|
Loading…
Reference in a new issue