mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 15:24:05 +00:00
* A lot of changes aimed to make lookups faster and to reduce startup times.
This commit is contained in:
parent 68c5c73b37
commit 32fe5dff9e
src/bgl.cc (15 changed lines)

@@ -26,6 +26,7 @@ using std::pair;
 
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace
 {
@@ -49,7 +50,8 @@ namespace
   uint32_t wordCount; // Total number of words, for informative purposes only
   /// Add more fields here, like name, description, author and such.
   uint32_t chunksOffset; // The offset to chunks' storage
-  uint32_t indexOffset; // The offset of the index in the file.
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
   uint32_t resourceListOffset; // The offset of the list of resources
   uint32_t resourcesCount; // Number of resources stored
 } __attribute__((packed));
@@ -239,9 +241,9 @@
 
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 }
 
@@ -739,7 +741,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
   // Good. Now build the index
 
-  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+  IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+  idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+  idxHeader.indexRootOffset = idxInfo.rootOffset;
 
   // Save the resource's list.

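Note on the change above (an added illustration, not part of the commit): bgl.cc, like the other format backends further down, stops storing a single indexOffset and instead keeps the two IndexInfo values (btreeMaxElements, rootOffset) directly in the packed .idx header, so opening a dictionary no longer needs an extra seek-and-read to locate the btree root. A minimal standalone sketch of that round trip follows; it uses plain stdio and a made-up two-field header rather than GoldenDict's File::Class and real IdxHeader.

#include <cstdint>
#include <cstdio>

// Mirrors BtreeIndexing::IndexInfo from the diff: the two values buildIndex()
// returns and openIndex() needs.
struct IndexInfo
{
  uint32_t btreeMaxElements, rootOffset;
};

// Hypothetical stand-in for a dictionary's IdxHeader: the index location is
// kept as the two IndexInfo fields instead of a single indexOffset.
struct IdxHeader
{
  uint32_t indexBtreeMaxElements;
  uint32_t indexRootOffset;
} __attribute__((packed));

int main()
{
  // "Build" side: pretend buildIndex() returned these values and store them.
  IndexInfo built = { 512, 4096 };
  IdxHeader header = { built.btreeMaxElements, built.rootOffset };

  std::FILE * f = std::fopen( "demo.idx", "wb+" );
  if ( !f )
    return 1;
  std::fwrite( &header, sizeof( header ), 1, f );

  // "Open" side: read the header back and feed the values straight to the
  // index code -- no trailing (btreeMaxElements, rootOffset) record to seek
  // to, which is what the old FormatVersion = 2 layout required.
  std::rewind( f );
  IdxHeader loaded;
  std::fread( &loaded, sizeof( loaded ), 1, f );
  std::fclose( f );

  IndexInfo info = { loaded.indexBtreeMaxElements, loaded.indexRootOffset };
  std::printf( "btreeMaxElements=%u rootOffset=%u\n",
               info.btreeMaxElements, info.rootOffset );
  return 0;
}
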
src/btreeidx.cc (341 changed lines)

@@ -42,19 +42,21 @@ enum
 
 BtreeDictionary::BtreeDictionary( string const & id,
                                   vector< string > const & dictionaryFiles ):
-  Dictionary::Class( id, dictionaryFiles ), idxFile( 0 )
+  Dictionary::Class( id, dictionaryFiles ), idxFile( 0 ), rootNodeLoaded( false )
 {
 }
 
-void BtreeDictionary::openIndex( File::Class & file, Mutex & mutex )
+void BtreeDictionary::openIndex( IndexInfo const & indexInfo,
+                                 File::Class & file, Mutex & mutex )
 {
   Mutex::Lock _( mutex );
 
-  indexNodeSize = file.read< uint32_t >();
-  rootOffset = file.read< uint32_t >();
+  indexNodeSize = indexInfo.btreeMaxElements;
+  rootOffset = indexInfo.rootOffset;
 
   idxFile = &file;
   idxFileMutex = &mutex;
+
+  rootNodeLoaded = false;
+  rootNode.clear();
 }
 
 vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )
@@ -68,8 +70,11 @@ vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )
   vector< char > leaf;
   uint32_t nextLeaf;
 
+  char const * leafEnd;
+
   char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch,
-                                                           leaf, nextLeaf );
+                                                           leaf, nextLeaf,
+                                                           leafEnd );
 
   if ( chainOffset && exactMatch )
   {
@@ -157,9 +162,11 @@ void BtreeWordSearchRequest::run()
 
   vector< char > leaf;
   uint32_t nextLeaf;
+  char const * leafEnd;
 
   char const * chainOffset = dict.findChainOffsetExactOrPrefix( folded, exactMatch,
-                                                                leaf, nextLeaf );
+                                                                leaf, nextLeaf,
+                                                                leafEnd );
 
   if ( chainOffset )
     for( ; ; )
@@ -198,7 +205,7 @@ void BtreeWordSearchRequest::run()
 
       // Fetch new leaf if we're out of chains here
 
-      if ( chainOffset > &leaf.back() )
+      if ( chainOffset >= leafEnd )
       {
        // We're past the current leaf, fetch the next one
 
@@ -209,6 +216,8 @@ void BtreeWordSearchRequest::run()
          Mutex::Lock _( *dict.idxFileMutex );
 
          dict.readNode( nextLeaf, leaf );
+          leafEnd = &leaf.front() + leaf.size();
+
          nextLeaf = dict.idxFile->read< uint32_t >();
          chainOffset = &leaf.front() + sizeof( uint32_t );
 
@@ -274,8 +283,9 @@ void BtreeDictionary::readNode( uint32_t offset, vector< char > & out )
 
 char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & target,
                                                             bool & exactMatch,
-                                                            vector< char > & leaf,
-                                                            uint32_t & nextLeaf )
+                                                            vector< char > & extLeaf,
+                                                            uint32_t & nextLeaf,
+                                                            char const * & leafEnd )
 {
   if ( !idxFile )
     throw exIndexWasNotOpened();
 
@@ -294,14 +304,21 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ
 
   uint32_t currentNodeOffset = rootOffset;
 
+  if ( !rootNodeLoaded )
+  {
+    // Time to load our root node. We do it only once, at the first request.
+    readNode( rootOffset, rootNode );
+    rootNodeLoaded = true;
+  }
+
+  char const * leaf = &rootNode.front();
+  leafEnd = leaf + rootNode.size();
+
   for( ; ; )
   {
-    //printf( "reading node at %x\n", currentNodeOffset );
-    readNode( currentNodeOffset, leaf );
-
     // Is it a leaf or a node?
 
-    uint32_t leafEntries = *(uint32_t *)&leaf.front();
+    uint32_t leafEntries = *(uint32_t *)leaf;
 
     if ( leafEntries == 0xffffFFFF )
     {
@@ -309,124 +326,266 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ
 
       //printf( "=>a node\n" );
 
-      uint32_t const * offsets = (uint32_t *)&leaf.front() + 1;
+      uint32_t const * offsets = (uint32_t *)leaf + 1;
 
-      char const * ptr = &leaf.front() + sizeof( uint32_t ) +
+      char const * ptr = leaf + sizeof( uint32_t ) +
                          ( indexNodeSize + 1 ) * sizeof( uint32_t );
 
-      unsigned entry;
-
-      for( entry = 0; entry < indexNodeSize; ++entry )
-      {
-        //printf( "checking node agaist word %s\n", ptr );
-        size_t wordSize = strlen( ptr );
-
-        if ( wcharBuffer.size() <= wordSize )
-          wcharBuffer.resize( wordSize + 1 );
-
-        long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
-
-        if ( result < 0 )
-          throw Utf8::exCantDecode( ptr );
-
-        wcharBuffer[ result ] = 0;
-
-        int compareResult = target.compare( &wcharBuffer.front() );
-
-        if ( !compareResult )
-        {
-          // The target string matches the current one.
-          // Go to the right, since it's there where we store such results.
-          currentNodeOffset = offsets[ entry + 1 ];
-          break;
-        }
-        if ( compareResult < 0 )
-        {
-          // The target string is smaller than the current one.
-          // Go to the left.
-          currentNodeOffset = offsets[ entry ];
-          break;
-        }
-
-        ptr += wordSize + 1;
-      }
-
-      if ( entry == indexNodeSize )
-      {
-        // We iterated through all entries, but our string is larger than
-        // all of them. Go the the rightmost node.
-        currentNodeOffset = offsets[ entry ];
-      }
+      // ptr now points to a span of zero-separated strings, up to leafEnd.
+      // We find our match using a binary search.
+
+      char const * closestString;
+
+      int compareResult;
+
+      char const * window = ptr;
+      unsigned windowSize = leafEnd - ptr;
+
+      for( ; ; )
+      {
+        // We boldly shoot in the middle of the whole mess, and then adjust
+        // to the beginning of the string that we've hit.
+        char const * testPoint = window + windowSize/2;
+
+        closestString = testPoint;
+
+        while( closestString > ptr && closestString[ -1 ] )
+          --closestString;
+
+        size_t wordSize = strlen( closestString );
+
+        if ( wcharBuffer.size() <= wordSize )
+          wcharBuffer.resize( wordSize + 1 );
+
+        long result = Utf8::decode( closestString, wordSize, &wcharBuffer.front() );
+
+        if ( result < 0 )
+          throw Utf8::exCantDecode( closestString );
+
+        wcharBuffer[ result ] = 0;
+
+        //printf( "Checking against %s\n", closestString );
+
+        compareResult = target.compare( &wcharBuffer.front() );
+
+        if ( !compareResult )
+        {
+          // The target string matches the current one. Finish the search.
+          break;
+        }
+        if ( compareResult < 0 )
+        {
+          // The target string is smaller than the current one.
+          // Go to the left.
+          windowSize = closestString - window;
+
+          if ( !windowSize )
+            break;
+        }
+        else
+        {
+          // The target string is larger than the current one.
+          // Go to the right.
+          windowSize -= ( closestString - window ) + wordSize + 1;
+          window = closestString + wordSize + 1;
+
+          if ( !windowSize )
+            break;
+        }
+      }
+
+#if 0
+      printf( "The winner is %s, compareResult = %d\n", closestString, compareResult );
+
+      if ( closestString != ptr )
+      {
+        char const * left = closestString -1;
+
+        while( left != ptr && left[ -1 ] )
+          --left;
+
+        printf( "To the left: %s\n", left );
+      }
+      else
+        printf( "To the lest -- nothing\n" );
+
+      char const * right = closestString + strlen( closestString ) + 1;
+
+      if ( right != leafEnd )
+      {
+        printf( "To the right: %s\n", right );
+      }
+      else
+        printf( "To the right -- nothing\n" );
+#endif
+
+      // Now, whatever the outcome (compareResult) is, we need to find
+      // entry number for the closestMatch string.
+
+      unsigned entry = 0;
+
+      for( char const * next = ptr; next != closestString;
+           next += strlen( next ) + 1, ++entry ) ;
+
+      // Ok, now check the outcome
+
+      if ( !compareResult )
+      {
+        // The target string matches the one found.
+        // Go to the right, since it's there where we store such results.
+        currentNodeOffset = offsets[ entry + 1 ];
+      }
+      if ( compareResult < 0 )
+      {
+        // The target string is smaller than the one found.
+        // Go to the left.
+        currentNodeOffset = offsets[ entry ];
+      }
+      else
+      {
+        // The target string is larger than the one found.
+        // Go to the right.
+        currentNodeOffset = offsets[ entry + 1 ];
+      }
+
+      //printf( "reading node at %x\n", currentNodeOffset );
+      readNode( currentNodeOffset, extLeaf );
+      leaf = &extLeaf.front();
+      leafEnd = leaf + extLeaf.size();
     }
     else
     {
       //printf( "=>a leaf\n" );
       // A leaf
 
-      nextLeaf = idxFile->read< uint32_t >();
-
-      // Iterate through chains until we find one that matches
-
-      char const * ptr = &leaf.front() + sizeof( uint32_t );
-
-      uint32_t chainSize;
-
-      while( leafEntries-- )
-      {
-        memcpy( &chainSize, ptr, sizeof( uint32_t ) );
-        ptr += sizeof( uint32_t );
-
-        if( chainSize )
-        {
-          size_t wordSize = strlen( ptr );
-
-          if ( wcharBuffer.size() <= wordSize )
-            wcharBuffer.resize( wordSize + 1 );
-
-          //printf( "checking agaist word %s, left = %u\n", ptr, leafEntries );
-
-          long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
-
-          if ( result < 0 )
-            throw Utf8::exCantDecode( ptr );
-
-          wcharBuffer[ result ] = 0;
-
-          wstring foldedWord = Folding::apply( &wcharBuffer.front() );
-
-          int compareResult = target.compare( foldedWord );
-
-          if ( !compareResult )
-          {
-            // Exact match -- return and be done
-            exactMatch = true;
-
-            return ptr - sizeof( uint32_t );
-          }
-          else
-          if ( compareResult < 0 )
-          {
-            // The target string is smaller than the current one.
-            // No point in travering further, return this result.
-
-            return ptr - sizeof( uint32_t );
-          }
-        }
-
-        ptr += chainSize;
-      }
-
-      // Well, our target is larger than all the chains here. This would mean
-      // that the next leaf is the right one.
-
-      if ( nextLeaf )
-      {
-        readNode( nextLeaf, leaf );
-
-        nextLeaf = idxFile->read< uint32_t >();
-
-        return &leaf.front() + sizeof( uint32_t );
-      }
-      else
-        return 0; // This was the last leaf
+      // If this leaf is the root, there's no next leaf, it just can't be.
+      // We do this check because the file's position indicator just won't
+      // be in the right place for root node anyway, since we precache it.
+      nextLeaf = ( currentNodeOffset != rootOffset ? idxFile->read< uint32_t >() : 0 );
+
+      if ( !leafEntries )
+      {
+        // Empty leaf? This may only be possible for entirely empty trees only.
+        if ( currentNodeOffset != rootOffset )
+          throw exCorruptedChainData();
+        else
+          return 0; // No match
+      }
+
+      // Build an array containing all chain pointers
+      char const * ptr = leaf + sizeof( uint32_t );
+
+      uint32_t chainSize;
+
+      vector< char const * > chainOffsets( leafEntries );
+
+      {
+        char const ** nextOffset = &chainOffsets.front();
+
+        while( leafEntries-- )
+        {
+          *nextOffset++ = ptr;
+
+          memcpy( &chainSize, ptr, sizeof( uint32_t ) );
+
+          //printf( "%s + %s\n", ptr + sizeof( uint32_t ), ptr + sizeof( uint32_t ) + strlen( ptr + sizeof( uint32_t ) ) + 1 );
+
+          ptr += sizeof( uint32_t ) + chainSize;
+        }
+      }
+
+      // Now do a binary search in it, aiming to find where our target
+      // string lands.
+
+      char const ** window = &chainOffsets.front();
+      unsigned windowSize = chainOffsets.size();
+
+      for( ; ; )
+      {
+        //printf( "window = %u, ws = %u\n", window - &chainOffsets.front(), windowSize );
+
+        char const ** chainToCheck = window + windowSize/2;
+        ptr = *chainToCheck;
+
+        memcpy( &chainSize, ptr, sizeof( uint32_t ) );
+        ptr += sizeof( uint32_t );
+
+        size_t wordSize = strlen( ptr );
+
+        if ( wcharBuffer.size() <= wordSize )
+          wcharBuffer.resize( wordSize + 1 );
+
+        //printf( "checking agaist word %s, left = %u\n", ptr, leafEntries );
+
+        long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
+
+        if ( result < 0 )
+          throw Utf8::exCantDecode( ptr );
+
+        wcharBuffer[ result ] = 0;
+
+        wstring foldedWord = Folding::apply( &wcharBuffer.front() );
+
+        int compareResult = target.compare( foldedWord );
+
+        if ( !compareResult )
+        {
+          // Exact match -- return and be done
+          exactMatch = true;
+
+          return ptr - sizeof( uint32_t );
+        }
+        else
+        if ( compareResult < 0 )
+        {
+          // The target string is smaller than the current one.
+          // Go to the first half
+
+          windowSize /= 2;
+
+          if ( !windowSize )
+          {
+            // That finishes our search. Since our target string
+            // landed before the last tested chain, we return a possible
+            // prefix match against that chain.
+            return ptr - sizeof( uint32_t );
+          }
+        }
+        else
+        {
+          // The target string is larger than the current one.
+          // Go to the second half
+
+          windowSize -= windowSize/2 + 1;
+
+          if ( !windowSize )
+          {
+            // That finishes our search. Since our target string
+            // landed after the last tested chain, we return the next
+            // chain. If there's no next chain in this leaf, this
+            // would mean the first element in the next leaf.
+            if ( chainToCheck == &chainOffsets.back() )
+            {
+              if ( nextLeaf )
+              {
+                readNode( nextLeaf, extLeaf );
+
+                leafEnd = &extLeaf.front() + extLeaf.size();
+
+                nextLeaf = idxFile->read< uint32_t >();
+
+                return &extLeaf.front() + sizeof( uint32_t );
+              }
+              else
+                return 0; // This was the last leaf
+            }
+            else
+              return chainToCheck[ 1 ];
+          }
+
+          window = chainToCheck + 1;
+        }
+      }
     }
   }
 }
@@ -764,7 +923,7 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset )
   }
 }
 
-uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )
+IndexInfo buildIndex( IndexedWords const & indexedWords, File::Class & file )
 {
   size_t indexSize = indexedWords.size();
   IndexedWords::const_iterator nextIndex = indexedWords.begin();
 
@@ -798,17 +957,7 @@ uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )
                                         file, btreeMaxElements,
                                         lastLeafOffset );
 
-  // We need to save btreeMaxElements. For simplicity, we just save it here
-  // along with root offset, and then return that record's offset as the
-  // offset of the index itself.
-
-  uint32_t indexOffset = file.tell();
-
-  file.write( (uint32_t) btreeMaxElements );
-  file.write( (uint32_t) rootOffset );
-
-  return indexOffset;
+  return IndexInfo( btreeMaxElements, rootOffset );
 }
 
 
 }

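Added illustration (not part of the commit): the biggest lookup win in btreeidx.cc above is replacing the linear scan of a node's zero-separated, sorted string span with a binary search that probes the middle of the current window and then backs up to the start of the string it landed in. The sketch below shows just that probing step over an in-memory buffer; it is a simplified stand-in, not the project's code (no UTF-8 decoding, case folding, offset table, or leaf handling).

#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

// Find the string that a binary search lands on for 'target' inside a buffer
// of zero-separated, sorted strings. Returns a pointer to that string's start.
static char const * probe( char const * begin, char const * end,
                           std::string const & target )
{
  char const * window = begin;
  size_t windowSize = end - begin;

  for( ; ; )
  {
    // Shoot into the middle of the window, then back up to the beginning of
    // the string we hit (the byte before a string start is always '\0').
    char const * closest = window + windowSize / 2;
    while( closest > begin && closest[ -1 ] )
      --closest;

    size_t wordSize = std::strlen( closest );
    int cmp = target.compare( closest );

    if ( !cmp )
      return closest;                       // exact hit

    if ( cmp < 0 )
    {
      windowSize = closest - window;        // continue in the left part
      if ( !windowSize )
        return closest;
    }
    else
    {
      windowSize -= ( closest - window ) + wordSize + 1;
      window = closest + wordSize + 1;      // continue in the right part
      if ( !windowSize )
        return closest;
    }
  }
}

int main()
{
  // Sorted, zero-separated keys, the way a btree node stores its span.
  char const data[] = "apple\0banana\0cherry\0plum\0";
  std::vector< char > node( data, data + sizeof( data ) - 1 );

  char const * hit = probe( &node.front(), &node.front() + node.size(), "cherry" );
  std::printf( "landed on: %s\n", hit );
  return 0;
}
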
src/btreeidx.hh

@@ -25,7 +25,7 @@ enum
   /// This is to be bumped up each time the internal format changes.
   /// The value isn't used here by itself, it is supposed to be added
   /// to each dictionary's internal format version.
-  FormatVersion = 2
+  FormatVersion = 3
 };
 
 // These exceptions which might be thrown during the index traversal
 
@@ -49,6 +49,16 @@ struct WordArticleLink
   {}
 };
 
+/// Information needed to open the index
+struct IndexInfo
+{
+  uint32_t btreeMaxElements, rootOffset;
+
+  IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ):
+    btreeMaxElements( btreeMaxElements_ ), rootOffset( rootOffset_ )
+  {}
+};
+
 class BtreeWordSearchRequest;
 
 /// A base for the dictionary that utilizes a btree index build using
 
@@ -67,11 +77,10 @@ public:
 
 protected:
 
-  /// Opens the index. The file must be positioned at the offset previously
-  /// returned by buildIndex(). The file reference is saved to be used for
+  /// Opens the index. The file reference is saved to be used for
   /// subsequent lookups.
   /// The mutex is the one to be locked when working with the file.
-  void openIndex( File::Class &, Mutex & );
+  void openIndex( IndexInfo const &, File::Class &, Mutex & );
 
   /// Finds articles that match the given string. A case-insensitive search
   /// is performed.
 
@@ -83,6 +92,9 @@ private:
   File::Class * idxFile;
   uint32_t indexNodeSize;
   uint32_t rootOffset;
+  bool rootNodeLoaded;
+  vector< char > rootNode; // We load root note here and keep it at all times,
+                           // since all searches always start with it.
 
   /// Finds the offset in the btree leaf for the given word, either matching
   /// by an exact match, or by finding the smallest entry that might match
 
@@ -91,10 +103,16 @@ private:
   /// to true when an exact match is located, and to false otherwise.
   /// The located leaf is loaded to 'leaf', and the pointer to the next
   /// leaf is saved to 'nextLeaf'.
+  /// However, due to root node being permanently cached, the 'leaf' passed
+  /// might not get used at all if the root node was the terminal one. In that
+  /// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
+  /// the leafEnd pointer always holds the pointer to the first byte outside
+  /// the node data.
   char const * findChainOffsetExactOrPrefix( wstring const & target,
                                              bool & exactMatch,
                                              vector< char > & leaf,
-                                             uint32_t & nextLeaf );
+                                             uint32_t & nextLeaf,
+                                             char const * & leafEnd );
 
   /// Reads a node or leaf at the given offset. Just uncompresses its data
   /// to the given vector and does nothing more.
 
@@ -128,10 +146,10 @@ struct IndexedWords: public map< wstring, vector< WordArticleLink > >
   void addWord( wstring const & word, uint32_t articleOffset );
 };
 
-/// Builds the index, as a compressed btree. Returns offset to its root.
+/// Builds the index, as a compressed btree. Returns IndexInfo.
 /// All the data is stored to the given file, beginning from its current
 /// position.
-uint32_t buildIndex( IndexedWords const &, File::Class & file );
+IndexInfo buildIndex( IndexedWords const &, File::Class & file );
 
 }

src/chunkedstorage.cc

@@ -15,6 +15,17 @@ enum
 
 Writer::Writer( File::Class & f ):
   file( f ), chunkStarted( false ), bufferUsed( 0 )
 {
+  // Create a sratchpad at the beginning of file. We use it to write chunk
+  // table if it would fit, in order to save some seek times.
+
+  char zero[ 4096 ];
+
+  memset( zero, 0, sizeof( zero ) );
+
+  scratchPadOffset = file.tell();
+  scratchPadSize = sizeof( zero );
+
+  file.write( zero, sizeof( zero ) );
 }
 
 uint32_t Writer::startNewBlock()
 
@@ -77,10 +88,25 @@ uint32_t Writer::finish()
   if ( bufferUsed || chunkStarted )
     saveCurrentChunk();
 
+  bool useScratchPad = false;
+  uint32_t savedOffset = 0;
+
+  if ( scratchPadSize >= offsets.size() * sizeof( uint32_t ) + sizeof( uint32_t ) )
+  {
+    useScratchPad = true;
+    savedOffset = file.tell();
+    file.seek( scratchPadOffset );
+  }
+
   uint32_t offset = file.tell();
 
   file.write( (uint32_t) offsets.size() );
-  file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) );
+
+  if ( offsets.size() )
+    file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) );
+
+  if ( useScratchPad )
+    file.seek( savedOffset );
 
   offsets.clear();
   chunkStarted = false;

src/chunkedstorage.hh

@@ -29,6 +29,7 @@ class Writer
 {
   vector< uint32_t > offsets;
   File::Class & file;
+  size_t scratchPadOffset, scratchPadSize;
 
 public:
 

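Added illustration (not part of the commit): the ChunkedStorage change above reserves a 4 KB scratch pad at the start of the output file and, in finish(), writes the chunk offset table into that pad when it fits instead of always appending it at the end, so the table ends up near the header and later reads seek less. Below is a rough standalone sketch of the same reserve-then-backfill pattern; PaddedWriter, std::fstream and the 4096-byte pad are illustrative stand-ins, not GoldenDict's File::Class API.

#include <cstdint>
#include <fstream>
#include <vector>

// Reserve a fixed scratch pad up front; back-fill the offsets table into it at
// finish() if it fits, otherwise append the table at the current end.
class PaddedWriter
{
  std::fstream file;
  std::streamoff scratchPadOffset;
  std::size_t scratchPadSize;
  std::vector< uint32_t > offsets;

public:
  explicit PaddedWriter( char const * name ):
    file( name, std::ios::binary | std::ios::out | std::ios::trunc ),
    scratchPadSize( 4096 )
  {
    std::vector< char > zero( scratchPadSize, 0 );
    scratchPadOffset = file.tellp();
    file.write( &zero.front(), zero.size() );   // placeholder bytes
  }

  void addChunk( std::vector< char > const & data )
  {
    offsets.push_back( (uint32_t) file.tellp() );
    file.write( &data.front(), data.size() );
  }

  // Writes the offsets table and returns the offset it was written at.
  uint32_t finish()
  {
    std::size_t tableSize = ( offsets.size() + 1 ) * sizeof( uint32_t );

    if ( tableSize <= scratchPadSize )
      file.seekp( scratchPadOffset );           // the table fits into the pad

    uint32_t tableOffset = (uint32_t) file.tellp();
    uint32_t count = (uint32_t) offsets.size();
    file.write( (char const *) &count, sizeof( count ) );
    if ( count )
      file.write( (char const *) &offsets.front(), count * sizeof( uint32_t ) );
    return tableOffset;
  }
};

int main()
{
  PaddedWriter writer( "demo.chunks" );
  writer.addChunk( std::vector< char >( 100, 'x' ) );
  writer.addChunk( std::vector< char >( 200, 'y' ) );
  writer.finish();
  return 0;
}
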
src/dictdfiles.cc

@@ -29,6 +29,7 @@ using std::list;
 
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace {
 
@@ -48,7 +49,8 @@ struct IdxHeader
   uint32_t signature; // First comes the signature, DCDX
   uint32_t formatVersion; // File format version (CurrentFormatVersion)
   uint32_t wordCount; // Total number of words
-  uint32_t indexOffset; // The offset of the index in the file
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
 } __attribute__((packed));
 
 bool indexIsOldOrBad( string const & indexFile )
 
@@ -109,9 +111,9 @@ DictdDictionary::DictdDictionary( string const & id,
 
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 }
 
 DictdDictionary::~DictdDictionary()
 
@@ -380,7 +382,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
   // Build index
 
-  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+  IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+  idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+  idxHeader.indexRootOffset = idxInfo.rootOffset;
 
   // That concludes it. Update the header.

src/dsl.cc (15 changed lines)

@@ -47,6 +47,7 @@ using std::list;
 
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace {
 
@@ -66,7 +67,8 @@ struct IdxHeader
   uint32_t chunksOffset; // The offset to chunks' storage
   uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress
   uint32_t abrvAddress; // Address of abrv map in the chunked storage
-  uint32_t indexOffset; // The offset of the index in the file
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
 } __attribute__((packed));
 
 bool indexIsOldOrBad( string const & indexFile )
 
@@ -201,9 +203,9 @@ DslDictionary::DslDictionary( string const & id,
 
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 
   // Open a resource zip file, if there's one
   resourceZip = zip_open( ( getDictionaryFilenames()[ 0 ] + ".files.zip" ).c_str(), 0, 0 );
 
@@ -1184,7 +1186,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
   // Build index
 
-  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+  IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+  idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+  idxHeader.indexRootOffset = idxInfo.rootOffset;
 
   // That concludes it. Update the header.

src/lsa.cc (15 changed lines)

@@ -23,6 +23,7 @@ using std::multimap;
 using std::set;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace {
 
@@ -43,7 +44,8 @@ struct IdxHeader
   uint32_t formatVersion; // File format version, currently 1.
   uint32_t soundsCount; // Total number of sounds, for informative purposes only
   uint32_t vorbisOffset; // Offset of the vorbis file which contains all snds
-  uint32_t indexOffset; // The offset of the index in the file
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
 } __attribute__((packed));
 
 bool indexIsOldOrBad( string const & indexFile )
 
@@ -174,9 +176,9 @@ LsaDictionary::LsaDictionary( string const & id,
 {
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 }
 
 sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
 
@@ -546,7 +548,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
   // Build the index
 
-  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+  IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+  idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+  idxHeader.indexRootOffset = idxInfo.rootOffset;
 
   // That concludes it. Update the header.

src/sounddir.cc

@@ -23,6 +23,7 @@ using std::multimap;
 using std::set;
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace {
 
@@ -38,7 +39,8 @@ struct IdxHeader
   uint32_t formatVersion; // File format version, is to be CurrentFormatVersion
   uint32_t soundsCount; // Total number of sounds, for informative purposes only
   uint32_t chunksOffset; // The offset to chunks' storage
-  uint32_t indexOffset; // The offset of the index in the file
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
 } __attribute__((packed));
 
 bool indexIsOldOrBad( string const & indexFile )
 
@@ -98,9 +100,9 @@ SoundDirDictionary::SoundDirDictionary( string const & id,
 {
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 }
 
 sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & word,
 
@@ -365,7 +367,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( Config::SoundDirs const &
 
   // Build the index
 
-  idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+  IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+  idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+  idxHeader.indexRootOffset = idxInfo.rootOffset;
 
   // That concludes it. Update the header.

src/stardict.cc (134 changed lines)

@@ -33,6 +33,7 @@ using std::wstring;
 
 using BtreeIndexing::WordArticleLink;
 using BtreeIndexing::IndexedWords;
+using BtreeIndexing::IndexInfo;
 
 namespace {
 
@@ -65,7 +66,7 @@ struct Ifo
 enum
 {
   Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
-  CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version
+  CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version
 };
 
 struct IdxHeader
 
@@ -73,7 +74,12 @@ struct IdxHeader
   uint32_t signature; // First comes the signature, SIDX
   uint32_t formatVersion; // File format version (CurrentFormatVersion)
   uint32_t chunksOffset; // The offset to chunks' storage
-  uint32_t indexOffset; // The offset of the index in the file
+  uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
+  uint32_t indexRootOffset;
+  uint32_t wordCount; // Saved from Ifo::wordcount
+  uint32_t synWordCount; // Saved from Ifo::synwordcount
+  uint32_t bookNameSize; // Book name's length. Used to read it then.
+  uint32_t sameTypeSequenceSize; // That string's size. Used to read it then.
 } __attribute__((packed));
 
 bool indexIsOldOrBad( string const & indexFile )
 
@@ -90,32 +96,32 @@ bool indexIsOldOrBad( string const & indexFile )
 
 class StardictDictionary: public BtreeIndexing::BtreeDictionary
 {
-  Ifo ifo;
   Mutex idxMutex;
   File::Class idx;
   IdxHeader idxHeader;
+  string bookName;
+  string sameTypeSequence;
   ChunkedStorage::Reader chunks;
   dictData * dz;
 
 public:
 
   StardictDictionary( string const & id, string const & indexFile,
-                      vector< string > const & dictionaryFiles,
-                      Ifo const & );
+                      vector< string > const & dictionaryFiles );
 
   ~StardictDictionary();
 
   virtual string getName() throw()
-  { return ifo.bookname; }
+  { return bookName; }
 
   virtual map< Dictionary::Property, string > getProperties() throw()
   { return map< Dictionary::Property, string >(); }
 
   virtual unsigned long getArticleCount() throw()
-  { return ifo.wordcount; }
+  { return idxHeader.wordCount; }
 
   virtual unsigned long getWordCount() throw()
-  { return ifo.wordcount + ifo.synwordcount; }
+  { return idxHeader.wordCount + idxHeader.synWordCount; }
 
   virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & )
     throw( std::exception );
 
@@ -136,16 +142,18 @@ private:
   void loadArticle( uint32_t address,
                     string & headword,
                     string & articleText );
 
+  string loadString( size_t size );
+
 };
 
 StardictDictionary::StardictDictionary( string const & id,
                                         string const & indexFile,
-                                        vector< string > const & dictionaryFiles,
-                                        Ifo const & ifo_ ):
+                                        vector< string > const & dictionaryFiles ):
   BtreeDictionary( id, dictionaryFiles ),
-  ifo( ifo_ ),
   idx( indexFile, "rb" ),
   idxHeader( idx.read< IdxHeader >() ),
+  bookName( loadString( idxHeader.bookNameSize ) ),
+  sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ),
   chunks( idx, idxHeader.chunksOffset )
 {
   // Open the .dict file
 
@@ -157,9 +165,9 @@ StardictDictionary::StardictDictionary( string const & id,
 
   // Initialize the index
 
-  idx.seek( idxHeader.indexOffset );
-
-  openIndex( idx, idxMutex );
+  openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
+                        idxHeader.indexRootOffset ),
+             idx, idxMutex );
 }
 
 StardictDictionary::~StardictDictionary()
 
@@ -168,6 +176,15 @@ StardictDictionary::~StardictDictionary()
     dict_data_close( dz );
 }
 
+string StardictDictionary::loadString( size_t size )
+{
+  vector< char > data( size );
+
+  idx.read( &data.front(), data.size() );
+
+  return string( &data.front(), data.size() );
+}
+
 void StardictDictionary::getArticleProps( uint32_t articleAddress,
                                           string & headword,
                                           uint32_t & offset, uint32_t & size )
 
@@ -252,14 +269,14 @@ void StardictDictionary::loadArticle( uint32_t address,
 
   char * ptr = articleBody;
 
-  if ( ifo.sametypesequence.size() )
+  if ( sameTypeSequence.size() )
   {
     /// The sequence is known, it's not stored in the article itself
-    for( unsigned seq = 0; seq < ifo.sametypesequence.size(); ++seq )
+    for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq )
     {
       // Last entry doesn't have size info -- it is inferred from
       // the bytes left
-      bool entrySizeKnown = ( seq == ifo.sametypesequence.size() - 1 );
+      bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 );
 
       uint32_t entrySize;
 
@@ -272,7 +289,7 @@ void StardictDictionary::loadArticle( uint32_t address,
         break;
       }
 
-      char type = ifo.sametypesequence[ seq ];
+      char type = sameTypeSequence[ seq ];
 
       if ( islower( type ) )
       {
 
@@ -610,8 +627,7 @@ static bool tryPossibleName( string const & name, string & copyTo )
 }
 
 static void findCorrespondingFiles( string const & ifo,
-                                    string & idx, string & dict, string & syn,
-                                    bool needSyn )
+                                    string & idx, string & dict, string & syn )
 {
   string base( ifo, 0, ifo.size() - 3 );
 
@@ -633,15 +649,15 @@ static void findCorrespondingFiles( string const & ifo,
        ) )
     throw exNoDictFile( ifo );
 
-  if ( needSyn && !(
-                    tryPossibleName( base + "syn", syn ) ||
-                    tryPossibleName( base + "syn.gz", syn ) ||
-                    tryPossibleName( base + "syn.dz", syn ) ||
-                    tryPossibleName( base + "SYN", syn ) ||
-                    tryPossibleName( base + "SYN.GZ", syn ) ||
-                    tryPossibleName( base + "SYN.DZ", syn )
+  if ( !(
+         tryPossibleName( base + "syn", syn ) ||
+         tryPossibleName( base + "syn.gz", syn ) ||
+         tryPossibleName( base + "syn.dz", syn ) ||
+         tryPossibleName( base + "SYN", syn ) ||
+         tryPossibleName( base + "SYN.GZ", syn ) ||
+         tryPossibleName( base + "SYN.DZ", syn )
        ) )
-    throw exNoSynFile( ifo );
+    syn.clear();
 }
 
 static void handleIdxSynFile( string const & fileName,
 
@@ -764,30 +780,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
     try
     {
-      File::Class ifoFile( *i, "r" );
-
-      Ifo ifo( ifoFile );
-
-      if ( ifo.idxoffsetbits == 64 )
-        throw ex64BitsNotSupported();
-
-      if ( ifo.dicttype.size() )
-        throw exDicttypeNotSupported();
-
-      printf( "bookname = %s\n", ifo.bookname.c_str() );
-      printf( "wordcount = %u\n", ifo.wordcount );
-
      vector< string > dictFiles( 1, *i );
 
      string idxFileName, dictFileName, synFileName;
 
-      findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName,
-                              ifo.synwordcount );
+      findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName );
 
      dictFiles.push_back( idxFileName );
      dictFiles.push_back( dictFileName );
 
-      if ( ifo.synwordcount )
+      if ( synFileName.size() )
        dictFiles.push_back( synFileName );
 
      string dictId = Dictionary::makeDictionaryId( dictFiles );
 
@@ -798,6 +800,33 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
           indexIsOldOrBad( indexFile ) )
      {
        // Building the index
 
+        File::Class ifoFile( *i, "r" );
+
+        Ifo ifo( ifoFile );
+
+        if ( ifo.idxoffsetbits == 64 )
+          throw ex64BitsNotSupported();
+
+        if ( ifo.dicttype.size() )
+          throw exDicttypeNotSupported();
+
+        if( synFileName.empty() )
+        {
+          if ( ifo.synwordcount )
+            throw exNoSynFile( *i );
+        }
+        else
+        if ( !ifo.synwordcount )
+        {
+          printf( "Warning: ignoring .syn file %s, since there's no synwordcount in .ifo specified\n",
+                  synFileName.c_str() );
+        }
+
+        printf( "bookname = %s\n", ifo.bookname.c_str() );
+        printf( "wordcount = %u\n", ifo.wordcount );
+
        initializing.indexingDictionary( ifo.bookname );
 
        File::Class idx( indexFile, "wb" );
 
@@ -811,6 +840,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
        idx.write( idxHeader );
 
+        idx.write( ifo.bookname.data(), ifo.bookname.size() );
+        idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() );
+
        IndexedWords indexedWords;
 
        ChunkedStorage::Writer chunks( idx );
 
@@ -837,13 +869,21 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
        // Build index
 
-        idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
+        IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
+
+        idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
+        idxHeader.indexRootOffset = idxInfo.rootOffset;
 
        // That concludes it. Update the header.
 
        idxHeader.signature = Signature;
        idxHeader.formatVersion = CurrentFormatVersion;
 
+        idxHeader.wordCount = ifo.wordcount;
+        idxHeader.synWordCount = ifo.synwordcount;
+        idxHeader.bookNameSize = ifo.bookname.size();
+        idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size();
+
        idx.rewind();
 
        idx.write( &idxHeader, sizeof( idxHeader ) );
 
@@ -851,9 +891,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
 
      dictionaries.push_back( new StardictDictionary( dictId,
                                                      indexFile,
-                                                      dictFiles,
-                                                      ifo ) );
-
+                                                      dictFiles ) );
    }
    catch( std::exception & e )
    {

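Added illustration (not part of the commit): the stardict.cc changes above move everything needed at runtime (word counts, bookname, sametypesequence) into the packed .idx header plus raw bytes written right after it, so the .ifo file is only parsed while the index is being built. A small sketch of that "length in the header, bytes after it" layout and a loadString()-style reader follows; MiniHeader and the file name are hypothetical, and plain stdio stands in for File::Class.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical cut-down header: just the two string sizes from the diff.
struct MiniHeader
{
  uint32_t bookNameSize;
  uint32_t sameTypeSequenceSize;
} __attribute__((packed));

// Read 'size' raw bytes from the current file position, the way
// StardictDictionary::loadString() does with its index file.
static std::string loadString( std::FILE * f, size_t size )
{
  std::vector< char > data( size );
  if ( size && std::fread( &data.front(), 1, size, f ) != size )
    return std::string();
  return std::string( data.begin(), data.end() );
}

int main()
{
  std::string bookName = "Example Dictionary";
  std::string sameTypeSequence = "m";

  // Build side: the header first, then the two strings as raw bytes.
  MiniHeader header = { (uint32_t) bookName.size(),
                        (uint32_t) sameTypeSequence.size() };

  std::FILE * f = std::fopen( "demo.stardict.idx", "wb+" );
  if ( !f )
    return 1;
  std::fwrite( &header, sizeof( header ), 1, f );
  std::fwrite( bookName.data(), 1, bookName.size(), f );
  std::fwrite( sameTypeSequence.data(), 1, sameTypeSequence.size(), f );

  // Load side: no .ifo parsing needed, everything comes from the index.
  std::rewind( f );
  MiniHeader loaded;
  std::fread( &loaded, sizeof( loaded ), 1, f );
  std::string loadedBookName = loadString( f, loaded.bookNameSize );
  std::string loadedSeq = loadString( f, loaded.sameTypeSequenceSize );
  std::fclose( f );

  std::printf( "bookname = %s, sametypesequence = %s\n",
               loadedBookName.c_str(), loadedSeq.c_str() );
  return 0;
}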