From 32fe5dff9ee0c09476d79a21266f74fa231d3db0 Mon Sep 17 00:00:00 2001 From: Konstantin Isakov Date: Tue, 14 Apr 2009 16:35:47 +0000 Subject: [PATCH] * A lot of changes aimed to make lookups faster and to reduce startup times. --- src/bgl.cc | 15 +- src/btreeidx.cc | 341 ++++++++++++++++++++++++++++++------------ src/btreeidx.hh | 32 +++- src/chunkedstorage.cc | 28 +++- src/chunkedstorage.hh | 1 + src/dictdfiles.cc | 15 +- src/dsl.cc | 15 +- src/lsa.cc | 15 +- src/sounddir.cc | 15 +- src/stardict.cc | 134 +++++++++++------ 10 files changed, 434 insertions(+), 177 deletions(-) diff --git a/src/bgl.cc b/src/bgl.cc index 8d0c838a..b56dc1eb 100644 --- a/src/bgl.cc +++ b/src/bgl.cc @@ -26,6 +26,7 @@ using std::pair; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -49,7 +50,8 @@ namespace uint32_t wordCount; // Total number of words, for informative purposes only /// Add more fields here, like name, description, author and such. uint32_t chunksOffset; // The offset to chunks' storage - uint32_t indexOffset; // The offset of the index in the file. + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; uint32_t resourceListOffset; // The offset of the list of resources uint32_t resourcesCount; // Number of resources stored } __attribute__((packed)); @@ -239,9 +241,9 @@ namespace // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); } @@ -739,7 +741,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Good. Now build the index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // Save the resource's list. diff --git a/src/btreeidx.cc b/src/btreeidx.cc index 83025e12..5c988230 100644 --- a/src/btreeidx.cc +++ b/src/btreeidx.cc @@ -42,19 +42,21 @@ enum BtreeDictionary::BtreeDictionary( string const & id, vector< string > const & dictionaryFiles ): - Dictionary::Class( id, dictionaryFiles ), idxFile( 0 ) + Dictionary::Class( id, dictionaryFiles ), idxFile( 0 ), rootNodeLoaded( false ) { } -void BtreeDictionary::openIndex( File::Class & file, Mutex & mutex ) +void BtreeDictionary::openIndex( IndexInfo const & indexInfo, + File::Class & file, Mutex & mutex ) { - Mutex::Lock _( mutex ); - - indexNodeSize = file.read< uint32_t >(); - rootOffset = file.read< uint32_t >(); + indexNodeSize = indexInfo.btreeMaxElements; + rootOffset = indexInfo.rootOffset; idxFile = &file; idxFileMutex = &mutex; + + rootNodeLoaded = false; + rootNode.clear(); } vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str ) @@ -68,8 +70,11 @@ vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str ) vector< char > leaf; uint32_t nextLeaf; + char const * leafEnd; + char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch, - leaf, nextLeaf ); + leaf, nextLeaf, + leafEnd ); if ( chainOffset && exactMatch ) { @@ -157,9 +162,11 @@ void BtreeWordSearchRequest::run() vector< char > leaf; uint32_t nextLeaf; + char const * leafEnd; char const * chainOffset = dict.findChainOffsetExactOrPrefix( folded, exactMatch, - leaf, nextLeaf ); + leaf, nextLeaf, + leafEnd ); if ( chainOffset ) for( ; ; ) @@ -198,7 +205,7 @@ void BtreeWordSearchRequest::run() // Fetch new leaf if we're out of chains here - if ( chainOffset > &leaf.back() ) + if ( chainOffset >= leafEnd ) { // We're past the current leaf, fetch the next one @@ -209,6 +216,8 @@ void BtreeWordSearchRequest::run() Mutex::Lock _( *dict.idxFileMutex ); dict.readNode( nextLeaf, leaf ); + leafEnd = &leaf.front() + leaf.size(); + nextLeaf = dict.idxFile->read< uint32_t >(); chainOffset = &leaf.front() + sizeof( uint32_t ); @@ -274,8 +283,9 @@ void BtreeDictionary::readNode( uint32_t offset, vector< char > & out ) char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & target, bool & exactMatch, - vector< char > & leaf, - uint32_t & nextLeaf ) + vector< char > & extLeaf, + uint32_t & nextLeaf, + char const * & leafEnd ) { if ( !idxFile ) throw exIndexWasNotOpened(); @@ -294,14 +304,21 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ uint32_t currentNodeOffset = rootOffset; + if ( !rootNodeLoaded ) + { + // Time to load our root node. We do it only once, at the first request. + readNode( rootOffset, rootNode ); + rootNodeLoaded = true; + } + + char const * leaf = &rootNode.front(); + leafEnd = leaf + rootNode.size(); + for( ; ; ) { - //printf( "reading node at %x\n", currentNodeOffset ); - readNode( currentNodeOffset, leaf ); - // Is it a leaf or a node? - uint32_t leafEntries = *(uint32_t *)&leaf.front(); + uint32_t leafEntries = *(uint32_t *)leaf; if ( leafEntries == 0xffffFFFF ) { @@ -309,124 +326,266 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ //printf( "=>a node\n" ); - uint32_t const * offsets = (uint32_t *)&leaf.front() + 1; + uint32_t const * offsets = (uint32_t *)leaf + 1; - char const * ptr = &leaf.front() + sizeof( uint32_t ) + + char const * ptr = leaf + sizeof( uint32_t ) + ( indexNodeSize + 1 ) * sizeof( uint32_t ); - unsigned entry; + // ptr now points to a span of zero-separated strings, up to leafEnd. + // We find our match using a binary search. - for( entry = 0; entry < indexNodeSize; ++entry ) - { - //printf( "checking node agaist word %s\n", ptr ); - size_t wordSize = strlen( ptr ); + char const * closestString; + int compareResult; + + char const * window = ptr; + unsigned windowSize = leafEnd - ptr; + + for( ; ; ) + { + // We boldly shoot in the middle of the whole mess, and then adjust + // to the beginning of the string that we've hit. + char const * testPoint = window + windowSize/2; + + closestString = testPoint; + + while( closestString > ptr && closestString[ -1 ] ) + --closestString; + + size_t wordSize = strlen( closestString ); + if ( wcharBuffer.size() <= wordSize ) wcharBuffer.resize( wordSize + 1 ); - - long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() ); - + + long result = Utf8::decode( closestString, wordSize, &wcharBuffer.front() ); + if ( result < 0 ) - throw Utf8::exCantDecode( ptr ); - + throw Utf8::exCantDecode( closestString ); + wcharBuffer[ result ] = 0; - int compareResult = target.compare( &wcharBuffer.front() ); + //printf( "Checking against %s\n", closestString ); + compareResult = target.compare( &wcharBuffer.front() ); + if ( !compareResult ) { - // The target string matches the current one. - // Go to the right, since it's there where we store such results. - currentNodeOffset = offsets[ entry + 1 ]; + // The target string matches the current one. Finish the search. break; } if ( compareResult < 0 ) { // The target string is smaller than the current one. // Go to the left. - currentNodeOffset = offsets[ entry ]; - break; - } + windowSize = closestString - window; - ptr += wordSize + 1; + if ( !windowSize ) + break; + } + else + { + // The target string is larger than the current one. + // Go to the right. + windowSize -= ( closestString - window ) + wordSize + 1; + window = closestString + wordSize + 1; + + if ( !windowSize ) + break; + } } - if ( entry == indexNodeSize ) + #if 0 + printf( "The winner is %s, compareResult = %d\n", closestString, compareResult ); + + if ( closestString != ptr ) { - // We iterated through all entries, but our string is larger than - // all of them. Go the the rightmost node. + char const * left = closestString -1; + + while( left != ptr && left[ -1 ] ) + --left; + + printf( "To the left: %s\n", left ); + } + else + printf( "To the lest -- nothing\n" ); + + char const * right = closestString + strlen( closestString ) + 1; + + if ( right != leafEnd ) + { + printf( "To the right: %s\n", right ); + } + else + printf( "To the right -- nothing\n" ); + #endif + + // Now, whatever the outcome (compareResult) is, we need to find + // entry number for the closestMatch string. + + unsigned entry = 0; + + for( char const * next = ptr; next != closestString; + next += strlen( next ) + 1, ++entry ) ; + + // Ok, now check the outcome + + if ( !compareResult ) + { + // The target string matches the one found. + // Go to the right, since it's there where we store such results. + currentNodeOffset = offsets[ entry + 1 ]; + } + if ( compareResult < 0 ) + { + // The target string is smaller than the one found. + // Go to the left. currentNodeOffset = offsets[ entry ]; } + else + { + // The target string is larger than the one found. + // Go to the right. + currentNodeOffset = offsets[ entry + 1 ]; + } + + //printf( "reading node at %x\n", currentNodeOffset ); + readNode( currentNodeOffset, extLeaf ); + leaf = &extLeaf.front(); + leafEnd = leaf + extLeaf.size(); } else { //printf( "=>a leaf\n" ); // A leaf - nextLeaf = idxFile->read< uint32_t >(); - // Iterate through chains until we find one that matches + // If this leaf is the root, there's no next leaf, it just can't be. + // We do this check because the file's position indicator just won't + // be in the right place for root node anyway, since we precache it. + nextLeaf = ( currentNodeOffset != rootOffset ? idxFile->read< uint32_t >() : 0 ); - char const * ptr = &leaf.front() + sizeof( uint32_t ); + if ( !leafEntries ) + { + // Empty leaf? This may only be possible for entirely empty trees only. + if ( currentNodeOffset != rootOffset ) + throw exCorruptedChainData(); + else + return 0; // No match + } + + // Build an array containing all chain pointers + char const * ptr = leaf + sizeof( uint32_t ); uint32_t chainSize; - while( leafEntries-- ) + vector< char const * > chainOffsets( leafEntries ); + { - memcpy( &chainSize, ptr, sizeof( uint32_t ) ); - ptr += sizeof( uint32_t ); + char const ** nextOffset = &chainOffsets.front(); - if( chainSize ) + while( leafEntries-- ) { - size_t wordSize = strlen( ptr ); + *nextOffset++ = ptr; - if ( wcharBuffer.size() <= wordSize ) - wcharBuffer.resize( wordSize + 1 ); + memcpy( &chainSize, ptr, sizeof( uint32_t ) ); - //printf( "checking agaist word %s, left = %u\n", ptr, leafEntries ); + //printf( "%s + %s\n", ptr + sizeof( uint32_t ), ptr + sizeof( uint32_t ) + strlen( ptr + sizeof( uint32_t ) ) + 1 ); - long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() ); - - if ( result < 0 ) - throw Utf8::exCantDecode( ptr ); - - wcharBuffer[ result ] = 0; - - wstring foldedWord = Folding::apply( &wcharBuffer.front() ); - - int compareResult = target.compare( foldedWord ); - - if ( !compareResult ) - { - // Exact match -- return and be done - exactMatch = true; - - return ptr - sizeof( uint32_t ); - } - else - if ( compareResult < 0 ) - { - // The target string is smaller than the current one. - // No point in travering further, return this result. - - return ptr - sizeof( uint32_t ); - } - ptr += chainSize; + ptr += sizeof( uint32_t ) + chainSize; } } - // Well, our target is larger than all the chains here. This would mean - // that the next leaf is the right one. + // Now do a binary search in it, aiming to find where our target + // string lands. - if ( nextLeaf ) + char const ** window = &chainOffsets.front(); + unsigned windowSize = chainOffsets.size(); + + for( ; ; ) { - readNode( nextLeaf, leaf ); + //printf( "window = %u, ws = %u\n", window - &chainOffsets.front(), windowSize ); - nextLeaf = idxFile->read< uint32_t >(); + char const ** chainToCheck = window + windowSize/2; + ptr = *chainToCheck; + + memcpy( &chainSize, ptr, sizeof( uint32_t ) ); + ptr += sizeof( uint32_t ); + + size_t wordSize = strlen( ptr ); + + if ( wcharBuffer.size() <= wordSize ) + wcharBuffer.resize( wordSize + 1 ); + + //printf( "checking agaist word %s, left = %u\n", ptr, leafEntries ); + + long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() ); + + if ( result < 0 ) + throw Utf8::exCantDecode( ptr ); + + wcharBuffer[ result ] = 0; + + wstring foldedWord = Folding::apply( &wcharBuffer.front() ); + + int compareResult = target.compare( foldedWord ); + + if ( !compareResult ) + { + // Exact match -- return and be done + exactMatch = true; + + return ptr - sizeof( uint32_t ); + } + else + if ( compareResult < 0 ) + { + // The target string is smaller than the current one. + // Go to the first half + + windowSize /= 2; - return &leaf.front() + sizeof( uint32_t ); + if ( !windowSize ) + { + // That finishes our search. Since our target string + // landed before the last tested chain, we return a possible + // prefix match against that chain. + return ptr - sizeof( uint32_t ); + } + } + else + { + // The target string is larger than the current one. + // Go to the second half + + windowSize -= windowSize/2 + 1; + + if ( !windowSize ) + { + // That finishes our search. Since our target string + // landed after the last tested chain, we return the next + // chain. If there's no next chain in this leaf, this + // would mean the first element in the next leaf. + if ( chainToCheck == &chainOffsets.back() ) + { + if ( nextLeaf ) + { + readNode( nextLeaf, extLeaf ); + + leafEnd = &extLeaf.front() + extLeaf.size(); + + nextLeaf = idxFile->read< uint32_t >(); + + return &extLeaf.front() + sizeof( uint32_t ); + } + else + return 0; // This was the last leaf + } + else + return chainToCheck[ 1 ]; + } + + window = chainToCheck + 1; + } } - else - return 0; // This was the last leaf } } } @@ -764,7 +923,7 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset ) } } -uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file ) +IndexInfo buildIndex( IndexedWords const & indexedWords, File::Class & file ) { size_t indexSize = indexedWords.size(); IndexedWords::const_iterator nextIndex = indexedWords.begin(); @@ -798,17 +957,7 @@ uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file ) file, btreeMaxElements, lastLeafOffset ); - // We need to save btreeMaxElements. For simplicity, we just save it here - // along with root offset, and then return that record's offset as the - // offset of the index itself. - - uint32_t indexOffset = file.tell(); - - file.write( (uint32_t) btreeMaxElements ); - file.write( (uint32_t) rootOffset ); - - return indexOffset; + return IndexInfo( btreeMaxElements, rootOffset ); } - } diff --git a/src/btreeidx.hh b/src/btreeidx.hh index c00c04e2..cf67440a 100644 --- a/src/btreeidx.hh +++ b/src/btreeidx.hh @@ -25,7 +25,7 @@ enum /// This is to be bumped up each time the internal format changes. /// The value isn't used here by itself, it is supposed to be added /// to each dictionary's internal format version. - FormatVersion = 2 + FormatVersion = 3 }; // These exceptions which might be thrown during the index traversal @@ -49,6 +49,16 @@ struct WordArticleLink {} }; +/// Information needed to open the index +struct IndexInfo +{ + uint32_t btreeMaxElements, rootOffset; + + IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ): + btreeMaxElements( btreeMaxElements_ ), rootOffset( rootOffset_ ) + {} +}; + class BtreeWordSearchRequest; /// A base for the dictionary that utilizes a btree index build using @@ -67,11 +77,10 @@ public: protected: - /// Opens the index. The file must be positioned at the offset previously - /// returned by buildIndex(). The file reference is saved to be used for + /// Opens the index. The file reference is saved to be used for /// subsequent lookups. /// The mutex is the one to be locked when working with the file. - void openIndex( File::Class &, Mutex & ); + void openIndex( IndexInfo const &, File::Class &, Mutex & ); /// Finds articles that match the given string. A case-insensitive search /// is performed. @@ -83,6 +92,9 @@ private: File::Class * idxFile; uint32_t indexNodeSize; uint32_t rootOffset; + bool rootNodeLoaded; + vector< char > rootNode; // We load root note here and keep it at all times, + // since all searches always start with it. /// Finds the offset in the btree leaf for the given word, either matching /// by an exact match, or by finding the smallest entry that might match @@ -91,10 +103,16 @@ private: /// to true when an exact match is located, and to false otherwise. /// The located leaf is loaded to 'leaf', and the pointer to the next /// leaf is saved to 'nextLeaf'. + /// However, due to root node being permanently cached, the 'leaf' passed + /// might not get used at all if the root node was the terminal one. In that + /// case, the returned pointer wouldn't belong to 'leaf' at all. To that end, + /// the leafEnd pointer always holds the pointer to the first byte outside + /// the node data. char const * findChainOffsetExactOrPrefix( wstring const & target, bool & exactMatch, vector< char > & leaf, - uint32_t & nextLeaf ); + uint32_t & nextLeaf, + char const * & leafEnd ); /// Reads a node or leaf at the given offset. Just uncompresses its data /// to the given vector and does nothing more. @@ -128,10 +146,10 @@ struct IndexedWords: public map< wstring, vector< WordArticleLink > > void addWord( wstring const & word, uint32_t articleOffset ); }; -/// Builds the index, as a compressed btree. Returns offset to its root. +/// Builds the index, as a compressed btree. Returns IndexInfo. /// All the data is stored to the given file, beginning from its current /// position. -uint32_t buildIndex( IndexedWords const &, File::Class & file ); +IndexInfo buildIndex( IndexedWords const &, File::Class & file ); } diff --git a/src/chunkedstorage.cc b/src/chunkedstorage.cc index d97d6097..91658c4a 100644 --- a/src/chunkedstorage.cc +++ b/src/chunkedstorage.cc @@ -15,6 +15,17 @@ enum Writer::Writer( File::Class & f ): file( f ), chunkStarted( false ), bufferUsed( 0 ) { + // Create a sratchpad at the beginning of file. We use it to write chunk + // table if it would fit, in order to save some seek times. + + char zero[ 4096 ]; + + memset( zero, 0, sizeof( zero ) ); + + scratchPadOffset = file.tell(); + scratchPadSize = sizeof( zero ); + + file.write( zero, sizeof( zero ) ); } uint32_t Writer::startNewBlock() @@ -77,10 +88,25 @@ uint32_t Writer::finish() if ( bufferUsed || chunkStarted ) saveCurrentChunk(); + bool useScratchPad = false; + uint32_t savedOffset = 0; + + if ( scratchPadSize >= offsets.size() * sizeof( uint32_t ) + sizeof( uint32_t ) ) + { + useScratchPad = true; + savedOffset = file.tell(); + file.seek( scratchPadOffset ); + } + uint32_t offset = file.tell(); file.write( (uint32_t) offsets.size() ); - file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) ); + + if ( offsets.size() ) + file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) ); + + if ( useScratchPad ) + file.seek( savedOffset ); offsets.clear(); chunkStarted = false; diff --git a/src/chunkedstorage.hh b/src/chunkedstorage.hh index 367fba67..c47e878d 100644 --- a/src/chunkedstorage.hh +++ b/src/chunkedstorage.hh @@ -29,6 +29,7 @@ class Writer { vector< uint32_t > offsets; File::Class & file; + size_t scratchPadOffset, scratchPadSize; public: diff --git a/src/dictdfiles.cc b/src/dictdfiles.cc index d112ab1d..dbdfc3a7 100644 --- a/src/dictdfiles.cc +++ b/src/dictdfiles.cc @@ -29,6 +29,7 @@ using std::list; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -48,7 +49,8 @@ struct IdxHeader uint32_t signature; // First comes the signature, DCDX uint32_t formatVersion; // File format version (CurrentFormatVersion) uint32_t wordCount; // Total number of words - uint32_t indexOffset; // The offset of the index in the file + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; } __attribute__((packed)); bool indexIsOldOrBad( string const & indexFile ) @@ -109,9 +111,9 @@ DictdDictionary::DictdDictionary( string const & id, // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); } DictdDictionary::~DictdDictionary() @@ -380,7 +382,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Build index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // That concludes it. Update the header. diff --git a/src/dsl.cc b/src/dsl.cc index 145631d3..58b32177 100644 --- a/src/dsl.cc +++ b/src/dsl.cc @@ -47,6 +47,7 @@ using std::list; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -66,7 +67,8 @@ struct IdxHeader uint32_t chunksOffset; // The offset to chunks' storage uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress uint32_t abrvAddress; // Address of abrv map in the chunked storage - uint32_t indexOffset; // The offset of the index in the file + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; } __attribute__((packed)); bool indexIsOldOrBad( string const & indexFile ) @@ -201,9 +203,9 @@ DslDictionary::DslDictionary( string const & id, // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); // Open a resource zip file, if there's one resourceZip = zip_open( ( getDictionaryFilenames()[ 0 ] + ".files.zip" ).c_str(), 0, 0 ); @@ -1184,7 +1186,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Build index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // That concludes it. Update the header. diff --git a/src/lsa.cc b/src/lsa.cc index 8f205a9a..9a0e2023 100644 --- a/src/lsa.cc +++ b/src/lsa.cc @@ -23,6 +23,7 @@ using std::multimap; using std::set; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -43,7 +44,8 @@ struct IdxHeader uint32_t formatVersion; // File format version, currently 1. uint32_t soundsCount; // Total number of sounds, for informative purposes only uint32_t vorbisOffset; // Offset of the vorbis file which contains all snds - uint32_t indexOffset; // The offset of the index in the file + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; } __attribute__((packed)); bool indexIsOldOrBad( string const & indexFile ) @@ -174,9 +176,9 @@ LsaDictionary::LsaDictionary( string const & id, { // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); } sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word, @@ -546,7 +548,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Build the index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // That concludes it. Update the header. diff --git a/src/sounddir.cc b/src/sounddir.cc index 3d61c13c..e45b2edb 100644 --- a/src/sounddir.cc +++ b/src/sounddir.cc @@ -23,6 +23,7 @@ using std::multimap; using std::set; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -38,7 +39,8 @@ struct IdxHeader uint32_t formatVersion; // File format version, is to be CurrentFormatVersion uint32_t soundsCount; // Total number of sounds, for informative purposes only uint32_t chunksOffset; // The offset to chunks' storage - uint32_t indexOffset; // The offset of the index in the file + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; } __attribute__((packed)); bool indexIsOldOrBad( string const & indexFile ) @@ -98,9 +100,9 @@ SoundDirDictionary::SoundDirDictionary( string const & id, { // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); } sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & word, @@ -365,7 +367,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( Config::SoundDirs const & // Build the index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // That concludes it. Update the header. diff --git a/src/stardict.cc b/src/stardict.cc index ef490ffa..bc038e69 100644 --- a/src/stardict.cc +++ b/src/stardict.cc @@ -33,6 +33,7 @@ using std::wstring; using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; +using BtreeIndexing::IndexInfo; namespace { @@ -65,7 +66,7 @@ struct Ifo enum { Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian - CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version + CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version }; struct IdxHeader @@ -73,7 +74,12 @@ struct IdxHeader uint32_t signature; // First comes the signature, SIDX uint32_t formatVersion; // File format version (CurrentFormatVersion) uint32_t chunksOffset; // The offset to chunks' storage - uint32_t indexOffset; // The offset of the index in the file + uint32_t indexBtreeMaxElements; // Two fields from IndexInfo + uint32_t indexRootOffset; + uint32_t wordCount; // Saved from Ifo::wordcount + uint32_t synWordCount; // Saved from Ifo::synwordcount + uint32_t bookNameSize; // Book name's length. Used to read it then. + uint32_t sameTypeSequenceSize; // That string's size. Used to read it then. } __attribute__((packed)); bool indexIsOldOrBad( string const & indexFile ) @@ -90,32 +96,32 @@ bool indexIsOldOrBad( string const & indexFile ) class StardictDictionary: public BtreeIndexing::BtreeDictionary { - Ifo ifo; Mutex idxMutex; File::Class idx; IdxHeader idxHeader; + string bookName; + string sameTypeSequence; ChunkedStorage::Reader chunks; dictData * dz; public: StardictDictionary( string const & id, string const & indexFile, - vector< string > const & dictionaryFiles, - Ifo const & ); + vector< string > const & dictionaryFiles ); ~StardictDictionary(); virtual string getName() throw() - { return ifo.bookname; } + { return bookName; } virtual map< Dictionary::Property, string > getProperties() throw() { return map< Dictionary::Property, string >(); } virtual unsigned long getArticleCount() throw() - { return ifo.wordcount; } + { return idxHeader.wordCount; } virtual unsigned long getWordCount() throw() - { return ifo.wordcount + ifo.synwordcount; } + { return idxHeader.wordCount + idxHeader.synWordCount; } virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & ) throw( std::exception ); @@ -136,16 +142,18 @@ private: void loadArticle( uint32_t address, string & headword, string & articleText ); + + string loadString( size_t size ); }; StardictDictionary::StardictDictionary( string const & id, string const & indexFile, - vector< string > const & dictionaryFiles, - Ifo const & ifo_ ): + vector< string > const & dictionaryFiles ): BtreeDictionary( id, dictionaryFiles ), - ifo( ifo_ ), idx( indexFile, "rb" ), idxHeader( idx.read< IdxHeader >() ), + bookName( loadString( idxHeader.bookNameSize ) ), + sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ), chunks( idx, idxHeader.chunksOffset ) { // Open the .dict file @@ -157,9 +165,9 @@ StardictDictionary::StardictDictionary( string const & id, // Initialize the index - idx.seek( idxHeader.indexOffset ); - - openIndex( idx, idxMutex ); + openIndex( IndexInfo( idxHeader.indexBtreeMaxElements, + idxHeader.indexRootOffset ), + idx, idxMutex ); } StardictDictionary::~StardictDictionary() @@ -168,6 +176,15 @@ StardictDictionary::~StardictDictionary() dict_data_close( dz ); } +string StardictDictionary::loadString( size_t size ) +{ + vector< char > data( size ); + + idx.read( &data.front(), data.size() ); + + return string( &data.front(), data.size() ); +} + void StardictDictionary::getArticleProps( uint32_t articleAddress, string & headword, uint32_t & offset, uint32_t & size ) @@ -252,14 +269,14 @@ void StardictDictionary::loadArticle( uint32_t address, char * ptr = articleBody; - if ( ifo.sametypesequence.size() ) + if ( sameTypeSequence.size() ) { /// The sequence is known, it's not stored in the article itself - for( unsigned seq = 0; seq < ifo.sametypesequence.size(); ++seq ) + for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq ) { // Last entry doesn't have size info -- it is inferred from // the bytes left - bool entrySizeKnown = ( seq == ifo.sametypesequence.size() - 1 ); + bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 ); uint32_t entrySize; @@ -272,7 +289,7 @@ void StardictDictionary::loadArticle( uint32_t address, break; } - char type = ifo.sametypesequence[ seq ]; + char type = sameTypeSequence[ seq ]; if ( islower( type ) ) { @@ -610,8 +627,7 @@ static bool tryPossibleName( string const & name, string & copyTo ) } static void findCorrespondingFiles( string const & ifo, - string & idx, string & dict, string & syn, - bool needSyn ) + string & idx, string & dict, string & syn ) { string base( ifo, 0, ifo.size() - 3 ); @@ -633,15 +649,15 @@ static void findCorrespondingFiles( string const & ifo, ) ) throw exNoDictFile( ifo ); - if ( needSyn && !( - tryPossibleName( base + "syn", syn ) || - tryPossibleName( base + "syn.gz", syn ) || - tryPossibleName( base + "syn.dz", syn ) || - tryPossibleName( base + "SYN", syn ) || - tryPossibleName( base + "SYN.GZ", syn ) || - tryPossibleName( base + "SYN.DZ", syn ) + if ( !( + tryPossibleName( base + "syn", syn ) || + tryPossibleName( base + "syn.gz", syn ) || + tryPossibleName( base + "syn.dz", syn ) || + tryPossibleName( base + "SYN", syn ) || + tryPossibleName( base + "SYN.GZ", syn ) || + tryPossibleName( base + "SYN.DZ", syn ) ) ) - throw exNoSynFile( ifo ); + syn.clear(); } static void handleIdxSynFile( string const & fileName, @@ -764,30 +780,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries( try { - File::Class ifoFile( *i, "r" ); - - Ifo ifo( ifoFile ); - - if ( ifo.idxoffsetbits == 64 ) - throw ex64BitsNotSupported(); - - if ( ifo.dicttype.size() ) - throw exDicttypeNotSupported(); - - printf( "bookname = %s\n", ifo.bookname.c_str() ); - printf( "wordcount = %u\n", ifo.wordcount ); - vector< string > dictFiles( 1, *i ); string idxFileName, dictFileName, synFileName; - findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName, - ifo.synwordcount ); + findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName ); dictFiles.push_back( idxFileName ); dictFiles.push_back( dictFileName ); - if ( ifo.synwordcount ) + if ( synFileName.size() ) dictFiles.push_back( synFileName ); string dictId = Dictionary::makeDictionaryId( dictFiles ); @@ -798,6 +800,33 @@ vector< sptr< Dictionary::Class > > makeDictionaries( indexIsOldOrBad( indexFile ) ) { // Building the index + + File::Class ifoFile( *i, "r" ); + + Ifo ifo( ifoFile ); + + if ( ifo.idxoffsetbits == 64 ) + throw ex64BitsNotSupported(); + + if ( ifo.dicttype.size() ) + throw exDicttypeNotSupported(); + + if( synFileName.empty() ) + { + if ( ifo.synwordcount ) + throw exNoSynFile( *i ); + } + else + if ( !ifo.synwordcount ) + { + printf( "Warning: ignoring .syn file %s, since there's no synwordcount in .ifo specified\n", + synFileName.c_str() ); + } + + + printf( "bookname = %s\n", ifo.bookname.c_str() ); + printf( "wordcount = %u\n", ifo.wordcount ); + initializing.indexingDictionary( ifo.bookname ); File::Class idx( indexFile, "wb" ); @@ -811,6 +840,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries( idx.write( idxHeader ); + idx.write( ifo.bookname.data(), ifo.bookname.size() ); + idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() ); + IndexedWords indexedWords; ChunkedStorage::Writer chunks( idx ); @@ -837,13 +869,21 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Build index - idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx ); + IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx ); + + idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements; + idxHeader.indexRootOffset = idxInfo.rootOffset; // That concludes it. Update the header. idxHeader.signature = Signature; idxHeader.formatVersion = CurrentFormatVersion; + idxHeader.wordCount = ifo.wordcount; + idxHeader.synWordCount = ifo.synwordcount; + idxHeader.bookNameSize = ifo.bookname.size(); + idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size(); + idx.rewind(); idx.write( &idxHeader, sizeof( idxHeader ) ); @@ -851,9 +891,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( dictionaries.push_back( new StardictDictionary( dictId, indexFile, - dictFiles, - ifo ) ); - + dictFiles ) ); } catch( std::exception & e ) {