* A lot of changes aimed to make lookups faster and to reduce startup times.

This commit is contained in:
Konstantin Isakov 2009-04-14 16:35:47 +00:00
parent 68c5c73b37
commit 32fe5dff9e
10 changed files with 434 additions and 177 deletions

View file

@ -26,6 +26,7 @@ using std::pair;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace
{
@ -49,7 +50,8 @@ namespace
uint32_t wordCount; // Total number of words, for informative purposes only
/// Add more fields here, like name, description, author and such.
uint32_t chunksOffset; // The offset to chunks' storage
uint32_t indexOffset; // The offset of the index in the file.
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
uint32_t resourceListOffset; // The offset of the list of resources
uint32_t resourcesCount; // Number of resources stored
} __attribute__((packed));
@ -239,9 +241,9 @@ namespace
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
}
@ -739,7 +741,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Good. Now build the index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// Save the resource's list.

View file

@ -42,19 +42,21 @@ enum
BtreeDictionary::BtreeDictionary( string const & id,
vector< string > const & dictionaryFiles ):
Dictionary::Class( id, dictionaryFiles ), idxFile( 0 )
Dictionary::Class( id, dictionaryFiles ), idxFile( 0 ), rootNodeLoaded( false )
{
}
void BtreeDictionary::openIndex( File::Class & file, Mutex & mutex )
void BtreeDictionary::openIndex( IndexInfo const & indexInfo,
File::Class & file, Mutex & mutex )
{
Mutex::Lock _( mutex );
indexNodeSize = file.read< uint32_t >();
rootOffset = file.read< uint32_t >();
indexNodeSize = indexInfo.btreeMaxElements;
rootOffset = indexInfo.rootOffset;
idxFile = &file;
idxFileMutex = &mutex;
rootNodeLoaded = false;
rootNode.clear();
}
vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )
@ -68,8 +70,11 @@ vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )
vector< char > leaf;
uint32_t nextLeaf;
char const * leafEnd;
char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch,
leaf, nextLeaf );
leaf, nextLeaf,
leafEnd );
if ( chainOffset && exactMatch )
{
@ -157,9 +162,11 @@ void BtreeWordSearchRequest::run()
vector< char > leaf;
uint32_t nextLeaf;
char const * leafEnd;
char const * chainOffset = dict.findChainOffsetExactOrPrefix( folded, exactMatch,
leaf, nextLeaf );
leaf, nextLeaf,
leafEnd );
if ( chainOffset )
for( ; ; )
@ -198,7 +205,7 @@ void BtreeWordSearchRequest::run()
// Fetch new leaf if we're out of chains here
if ( chainOffset > &leaf.back() )
if ( chainOffset >= leafEnd )
{
// We're past the current leaf, fetch the next one
@ -209,6 +216,8 @@ void BtreeWordSearchRequest::run()
Mutex::Lock _( *dict.idxFileMutex );
dict.readNode( nextLeaf, leaf );
leafEnd = &leaf.front() + leaf.size();
nextLeaf = dict.idxFile->read< uint32_t >();
chainOffset = &leaf.front() + sizeof( uint32_t );
@ -274,8 +283,9 @@ void BtreeDictionary::readNode( uint32_t offset, vector< char > & out )
char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & target,
bool & exactMatch,
vector< char > & leaf,
uint32_t & nextLeaf )
vector< char > & extLeaf,
uint32_t & nextLeaf,
char const * & leafEnd )
{
if ( !idxFile )
throw exIndexWasNotOpened();
@ -294,14 +304,21 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ
uint32_t currentNodeOffset = rootOffset;
if ( !rootNodeLoaded )
{
// Time to load our root node. We do it only once, at the first request.
readNode( rootOffset, rootNode );
rootNodeLoaded = true;
}
char const * leaf = &rootNode.front();
leafEnd = leaf + rootNode.size();
for( ; ; )
{
//printf( "reading node at %x\n", currentNodeOffset );
readNode( currentNodeOffset, leaf );
// Is it a leaf or a node?
uint32_t leafEntries = *(uint32_t *)&leaf.front();
uint32_t leafEntries = *(uint32_t *)leaf;
if ( leafEntries == 0xffffFFFF )
{
@ -309,124 +326,266 @@ char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & targ
//printf( "=>a node\n" );
uint32_t const * offsets = (uint32_t *)&leaf.front() + 1;
uint32_t const * offsets = (uint32_t *)leaf + 1;
char const * ptr = &leaf.front() + sizeof( uint32_t ) +
char const * ptr = leaf + sizeof( uint32_t ) +
( indexNodeSize + 1 ) * sizeof( uint32_t );
unsigned entry;
// ptr now points to a span of zero-separated strings, up to leafEnd.
// We find our match using a binary search.
for( entry = 0; entry < indexNodeSize; ++entry )
{
//printf( "checking node agaist word %s\n", ptr );
size_t wordSize = strlen( ptr );
char const * closestString;
int compareResult;
char const * window = ptr;
unsigned windowSize = leafEnd - ptr;
for( ; ; )
{
// We boldly shoot in the middle of the whole mess, and then adjust
// to the beginning of the string that we've hit.
char const * testPoint = window + windowSize/2;
closestString = testPoint;
while( closestString > ptr && closestString[ -1 ] )
--closestString;
size_t wordSize = strlen( closestString );
if ( wcharBuffer.size() <= wordSize )
wcharBuffer.resize( wordSize + 1 );
long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
long result = Utf8::decode( closestString, wordSize, &wcharBuffer.front() );
if ( result < 0 )
throw Utf8::exCantDecode( ptr );
throw Utf8::exCantDecode( closestString );
wcharBuffer[ result ] = 0;
int compareResult = target.compare( &wcharBuffer.front() );
//printf( "Checking against %s\n", closestString );
compareResult = target.compare( &wcharBuffer.front() );
if ( !compareResult )
{
// The target string matches the current one.
// Go to the right, since it's there where we store such results.
currentNodeOffset = offsets[ entry + 1 ];
// The target string matches the current one. Finish the search.
break;
}
if ( compareResult < 0 )
{
// The target string is smaller than the current one.
// Go to the left.
currentNodeOffset = offsets[ entry ];
break;
}
windowSize = closestString - window;
ptr += wordSize + 1;
if ( !windowSize )
break;
}
else
{
// The target string is larger than the current one.
// Go to the right.
windowSize -= ( closestString - window ) + wordSize + 1;
window = closestString + wordSize + 1;
if ( !windowSize )
break;
}
}
if ( entry == indexNodeSize )
#if 0
printf( "The winner is %s, compareResult = %d\n", closestString, compareResult );
if ( closestString != ptr )
{
// We iterated through all entries, but our string is larger than
// all of them. Go the the rightmost node.
char const * left = closestString -1;
while( left != ptr && left[ -1 ] )
--left;
printf( "To the left: %s\n", left );
}
else
printf( "To the lest -- nothing\n" );
char const * right = closestString + strlen( closestString ) + 1;
if ( right != leafEnd )
{
printf( "To the right: %s\n", right );
}
else
printf( "To the right -- nothing\n" );
#endif
// Now, whatever the outcome (compareResult) is, we need to find
// the entry number of the closestString string.
unsigned entry = 0;
for( char const * next = ptr; next != closestString;
next += strlen( next ) + 1, ++entry ) ;
// Ok, now check the outcome
if ( !compareResult )
{
// The target string matches the one found.
// Go to the right, since it's there where we store such results.
currentNodeOffset = offsets[ entry + 1 ];
}
if ( compareResult < 0 )
{
// The target string is smaller than the one found.
// Go to the left.
currentNodeOffset = offsets[ entry ];
}
else
{
// The target string is larger than the one found.
// Go to the right.
currentNodeOffset = offsets[ entry + 1 ];
}
//printf( "reading node at %x\n", currentNodeOffset );
readNode( currentNodeOffset, extLeaf );
leaf = &extLeaf.front();
leafEnd = leaf + extLeaf.size();
}
else
{
//printf( "=>a leaf\n" );
// A leaf
nextLeaf = idxFile->read< uint32_t >();
// Iterate through chains until we find one that matches
// If this leaf is the root, there's no next leaf, it just can't be.
// We do this check because the file's position indicator just won't
// be in the right place for root node anyway, since we precache it.
nextLeaf = ( currentNodeOffset != rootOffset ? idxFile->read< uint32_t >() : 0 );
char const * ptr = &leaf.front() + sizeof( uint32_t );
if ( !leafEntries )
{
// Empty leaf? This is only possible for entirely empty trees.
if ( currentNodeOffset != rootOffset )
throw exCorruptedChainData();
else
return 0; // No match
}
// Build an array containing all chain pointers
char const * ptr = leaf + sizeof( uint32_t );
uint32_t chainSize;
while( leafEntries-- )
vector< char const * > chainOffsets( leafEntries );
{
memcpy( &chainSize, ptr, sizeof( uint32_t ) );
ptr += sizeof( uint32_t );
char const ** nextOffset = &chainOffsets.front();
if( chainSize )
while( leafEntries-- )
{
size_t wordSize = strlen( ptr );
*nextOffset++ = ptr;
if ( wcharBuffer.size() <= wordSize )
wcharBuffer.resize( wordSize + 1 );
memcpy( &chainSize, ptr, sizeof( uint32_t ) );
//printf( "checking agaist word %s, left = %u\n", ptr, leafEntries );
//printf( "%s + %s\n", ptr + sizeof( uint32_t ), ptr + sizeof( uint32_t ) + strlen( ptr + sizeof( uint32_t ) ) + 1 );
long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
if ( result < 0 )
throw Utf8::exCantDecode( ptr );
wcharBuffer[ result ] = 0;
wstring foldedWord = Folding::apply( &wcharBuffer.front() );
int compareResult = target.compare( foldedWord );
if ( !compareResult )
{
// Exact match -- return and be done
exactMatch = true;
return ptr - sizeof( uint32_t );
}
else
if ( compareResult < 0 )
{
// The target string is smaller than the current one.
// No point in travering further, return this result.
return ptr - sizeof( uint32_t );
}
ptr += chainSize;
ptr += sizeof( uint32_t ) + chainSize;
}
}
// Well, our target is larger than all the chains here. This would mean
// that the next leaf is the right one.
// Now do a binary search in it, aiming to find where our target
// string lands.
if ( nextLeaf )
char const ** window = &chainOffsets.front();
unsigned windowSize = chainOffsets.size();
for( ; ; )
{
readNode( nextLeaf, leaf );
//printf( "window = %u, ws = %u\n", window - &chainOffsets.front(), windowSize );
nextLeaf = idxFile->read< uint32_t >();
char const ** chainToCheck = window + windowSize/2;
ptr = *chainToCheck;
memcpy( &chainSize, ptr, sizeof( uint32_t ) );
ptr += sizeof( uint32_t );
size_t wordSize = strlen( ptr );
if ( wcharBuffer.size() <= wordSize )
wcharBuffer.resize( wordSize + 1 );
//printf( "checking agaist word %s, left = %u\n", ptr, leafEntries );
long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );
if ( result < 0 )
throw Utf8::exCantDecode( ptr );
wcharBuffer[ result ] = 0;
wstring foldedWord = Folding::apply( &wcharBuffer.front() );
int compareResult = target.compare( foldedWord );
if ( !compareResult )
{
// Exact match -- return and be done
exactMatch = true;
return ptr - sizeof( uint32_t );
}
else
if ( compareResult < 0 )
{
// The target string is smaller than the current one.
// Go to the first half
windowSize /= 2;
return &leaf.front() + sizeof( uint32_t );
if ( !windowSize )
{
// That finishes our search. Since our target string
// landed before the last tested chain, we return a possible
// prefix match against that chain.
return ptr - sizeof( uint32_t );
}
}
else
{
// The target string is larger than the current one.
// Go to the second half
windowSize -= windowSize/2 + 1;
if ( !windowSize )
{
// That finishes our search. Since our target string
// landed after the last tested chain, we return the next
// chain. If there's no next chain in this leaf, this
// would mean the first element in the next leaf.
if ( chainToCheck == &chainOffsets.back() )
{
if ( nextLeaf )
{
readNode( nextLeaf, extLeaf );
leafEnd = &extLeaf.front() + extLeaf.size();
nextLeaf = idxFile->read< uint32_t >();
return &extLeaf.front() + sizeof( uint32_t );
}
else
return 0; // This was the last leaf
}
else
return chainToCheck[ 1 ];
}
window = chainToCheck + 1;
}
}
else
return 0; // This was the last leaf
}
}
}
@ -764,7 +923,7 @@ void IndexedWords::addWord( wstring const & word, uint32_t articleOffset )
}
}
uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )
IndexInfo buildIndex( IndexedWords const & indexedWords, File::Class & file )
{
size_t indexSize = indexedWords.size();
IndexedWords::const_iterator nextIndex = indexedWords.begin();
@ -798,17 +957,7 @@ uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )
file, btreeMaxElements,
lastLeafOffset );
// We need to save btreeMaxElements. For simplicity, we just save it here
// along with root offset, and then return that record's offset as the
// offset of the index itself.
uint32_t indexOffset = file.tell();
file.write( (uint32_t) btreeMaxElements );
file.write( (uint32_t) rootOffset );
return indexOffset;
return IndexInfo( btreeMaxElements, rootOffset );
}
}

View file

@ -25,7 +25,7 @@ enum
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 2
FormatVersion = 3
};
// These exceptions which might be thrown during the index traversal
@ -49,6 +49,16 @@ struct WordArticleLink
{}
};
/// The pair of values needed to open a btree index: buildIndex() returns
/// it, and BtreeDictionary::openIndex() consumes it.
struct IndexInfo
{
  /// Stored by openIndex() as the index node size.
  uint32_t btreeMaxElements;
  /// Offset from which the root node gets read.
  uint32_t rootOffset;

  IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ):
    btreeMaxElements( btreeMaxElements_ ),
    rootOffset( rootOffset_ )
  {
  }
};
class BtreeWordSearchRequest;
/// A base for the dictionary that utilizes a btree index build using
@ -67,11 +77,10 @@ public:
protected:
/// Opens the index. The file must be positioned at the offset previously
/// returned by buildIndex(). The file reference is saved to be used for
/// Opens the index. The file reference is saved to be used for
/// subsequent lookups.
/// The mutex is the one to be locked when working with the file.
void openIndex( File::Class &, Mutex & );
void openIndex( IndexInfo const &, File::Class &, Mutex & );
/// Finds articles that match the given string. A case-insensitive search
/// is performed.
@ -83,6 +92,9 @@ private:
File::Class * idxFile;
uint32_t indexNodeSize;
uint32_t rootOffset;
bool rootNodeLoaded; // Set once rootNode has been read; reset by openIndex()
vector< char > rootNode; // We load the root node here and keep it at all times,
// since all searches always start with it.
/// Finds the offset in the btree leaf for the given word, either matching
/// by an exact match, or by finding the smallest entry that might match
@ -91,10 +103,16 @@ private:
/// to true when an exact match is located, and to false otherwise.
/// The located leaf is loaded to 'leaf', and the pointer to the next
/// leaf is saved to 'nextLeaf'.
/// However, due to root node being permanently cached, the 'leaf' passed
/// might not get used at all if the root node was the terminal one. In that
/// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
/// the leafEnd pointer always holds the pointer to the first byte outside
/// the node data.
char const * findChainOffsetExactOrPrefix( wstring const & target,
bool & exactMatch,
vector< char > & leaf,
uint32_t & nextLeaf );
uint32_t & nextLeaf,
char const * & leafEnd );
/// Reads a node or leaf at the given offset. Just uncompresses its data
/// to the given vector and does nothing more.
@ -128,10 +146,10 @@ struct IndexedWords: public map< wstring, vector< WordArticleLink > >
void addWord( wstring const & word, uint32_t articleOffset );
};
/// Builds the index, as a compressed btree. Returns offset to its root.
/// Builds the index, as a compressed btree. Returns IndexInfo.
/// All the data is stored to the given file, beginning from its current
/// position.
uint32_t buildIndex( IndexedWords const &, File::Class & file );
IndexInfo buildIndex( IndexedWords const &, File::Class & file );
}

View file

@ -15,6 +15,17 @@ enum
/// Creates a writer on top of the given file and immediately reserves
/// a zero-filled scratchpad at the file's current position.
Writer::Writer( File::Class & f ):
  file( f ), chunkStarted( false ), bufferUsed( 0 )
{
  // Reserve a scratchpad right where the file is positioned now. finish()
  // writes the chunk table there if it fits, in order to save some seek
  // times.
  char scratchPad[ 4096 ];

  scratchPadSize = sizeof( scratchPad );
  scratchPadOffset = file.tell();

  memset( scratchPad, 0, sizeof( scratchPad ) );
  file.write( scratchPad, sizeof( scratchPad ) );
}
uint32_t Writer::startNewBlock()
@ -77,10 +88,25 @@ uint32_t Writer::finish()
if ( bufferUsed || chunkStarted )
saveCurrentChunk();
bool useScratchPad = false;
uint32_t savedOffset = 0;
if ( scratchPadSize >= offsets.size() * sizeof( uint32_t ) + sizeof( uint32_t ) )
{
useScratchPad = true;
savedOffset = file.tell();
file.seek( scratchPadOffset );
}
uint32_t offset = file.tell();
file.write( (uint32_t) offsets.size() );
file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) );
if ( offsets.size() )
file.write( &offsets.front(), offsets.size() * sizeof( uint32_t ) );
if ( useScratchPad )
file.seek( savedOffset );
offsets.clear();
chunkStarted = false;

View file

@ -29,6 +29,7 @@ class Writer
{
vector< uint32_t > offsets;
File::Class & file;
size_t scratchPadOffset, scratchPadSize;
public:

View file

@ -29,6 +29,7 @@ using std::list;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace {
@ -48,7 +49,8 @@ struct IdxHeader
uint32_t signature; // First comes the signature, DCDX
uint32_t formatVersion; // File format version (CurrentFormatVersion)
uint32_t wordCount; // Total number of words
uint32_t indexOffset; // The offset of the index in the file
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -109,9 +111,9 @@ DictdDictionary::DictdDictionary( string const & id,
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
}
DictdDictionary::~DictdDictionary()
@ -380,7 +382,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Build index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// That concludes it. Update the header.

View file

@ -47,6 +47,7 @@ using std::list;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace {
@ -66,7 +67,8 @@ struct IdxHeader
uint32_t chunksOffset; // The offset to chunks' storage
uint32_t hasAbrv; // Non-zero means file has abrvs at abrvAddress
uint32_t abrvAddress; // Address of abrv map in the chunked storage
uint32_t indexOffset; // The offset of the index in the file
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -201,9 +203,9 @@ DslDictionary::DslDictionary( string const & id,
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
// Open a resource zip file, if there's one
resourceZip = zip_open( ( getDictionaryFilenames()[ 0 ] + ".files.zip" ).c_str(), 0, 0 );
@ -1184,7 +1186,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Build index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// That concludes it. Update the header.

View file

@ -23,6 +23,7 @@ using std::multimap;
using std::set;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace {
@ -43,7 +44,8 @@ struct IdxHeader
uint32_t formatVersion; // File format version, currently 1.
uint32_t soundsCount; // Total number of sounds, for informative purposes only
uint32_t vorbisOffset; // Offset of the vorbis file which contains all snds
uint32_t indexOffset; // The offset of the index in the file
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -174,9 +176,9 @@ LsaDictionary::LsaDictionary( string const & id,
{
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
}
sptr< Dictionary::DataRequest > LsaDictionary::getArticle( wstring const & word,
@ -546,7 +548,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Build the index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// That concludes it. Update the header.

View file

@ -23,6 +23,7 @@ using std::multimap;
using std::set;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace {
@ -38,7 +39,8 @@ struct IdxHeader
uint32_t formatVersion; // File format version, is to be CurrentFormatVersion
uint32_t soundsCount; // Total number of sounds, for informative purposes only
uint32_t chunksOffset; // The offset to chunks' storage
uint32_t indexOffset; // The offset of the index in the file
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -98,9 +100,9 @@ SoundDirDictionary::SoundDirDictionary( string const & id,
{
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
}
sptr< Dictionary::DataRequest > SoundDirDictionary::getArticle( wstring const & word,
@ -365,7 +367,10 @@ vector< sptr< Dictionary::Class > > makeDictionaries( Config::SoundDirs const &
// Build the index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// That concludes it. Update the header.

View file

@ -33,6 +33,7 @@ using std::wstring;
using BtreeIndexing::WordArticleLink;
using BtreeIndexing::IndexedWords;
using BtreeIndexing::IndexInfo;
namespace {
@ -65,7 +66,7 @@ struct Ifo
enum
{
Signature = 0x58444953, // SIDX on little-endian, XDIS on big-endian
CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version
};
struct IdxHeader
@ -73,7 +74,12 @@ struct IdxHeader
uint32_t signature; // First comes the signature, SIDX
uint32_t formatVersion; // File format version (CurrentFormatVersion)
uint32_t chunksOffset; // The offset to chunks' storage
uint32_t indexOffset; // The offset of the index in the file
uint32_t indexBtreeMaxElements; // Two fields from IndexInfo
uint32_t indexRootOffset;
uint32_t wordCount; // Saved from Ifo::wordcount
uint32_t synWordCount; // Saved from Ifo::synwordcount
uint32_t bookNameSize; // Book name's length. Used to read it then.
uint32_t sameTypeSequenceSize; // That string's size. Used to read it then.
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -90,32 +96,32 @@ bool indexIsOldOrBad( string const & indexFile )
class StardictDictionary: public BtreeIndexing::BtreeDictionary
{
Ifo ifo;
Mutex idxMutex;
File::Class idx;
IdxHeader idxHeader;
string bookName;
string sameTypeSequence;
ChunkedStorage::Reader chunks;
dictData * dz;
public:
StardictDictionary( string const & id, string const & indexFile,
vector< string > const & dictionaryFiles,
Ifo const & );
vector< string > const & dictionaryFiles );
~StardictDictionary();
virtual string getName() throw()
{ return ifo.bookname; }
{ return bookName; }
virtual map< Dictionary::Property, string > getProperties() throw()
{ return map< Dictionary::Property, string >(); }
virtual unsigned long getArticleCount() throw()
{ return ifo.wordcount; }
{ return idxHeader.wordCount; }
virtual unsigned long getWordCount() throw()
{ return ifo.wordcount + ifo.synwordcount; }
{ return idxHeader.wordCount + idxHeader.synWordCount; }
virtual sptr< Dictionary::WordSearchRequest > findHeadwordsForSynonym( wstring const & )
throw( std::exception );
@ -136,16 +142,18 @@ private:
void loadArticle( uint32_t address,
string & headword,
string & articleText );
string loadString( size_t size );
};
StardictDictionary::StardictDictionary( string const & id,
string const & indexFile,
vector< string > const & dictionaryFiles,
Ifo const & ifo_ ):
vector< string > const & dictionaryFiles ):
BtreeDictionary( id, dictionaryFiles ),
ifo( ifo_ ),
idx( indexFile, "rb" ),
idxHeader( idx.read< IdxHeader >() ),
bookName( loadString( idxHeader.bookNameSize ) ),
sameTypeSequence( loadString( idxHeader.sameTypeSequenceSize ) ),
chunks( idx, idxHeader.chunksOffset )
{
// Open the .dict file
@ -157,9 +165,9 @@ StardictDictionary::StardictDictionary( string const & id,
// Initialize the index
idx.seek( idxHeader.indexOffset );
openIndex( idx, idxMutex );
openIndex( IndexInfo( idxHeader.indexBtreeMaxElements,
idxHeader.indexRootOffset ),
idx, idxMutex );
}
StardictDictionary::~StardictDictionary()
@ -168,6 +176,15 @@ StardictDictionary::~StardictDictionary()
dict_data_close( dz );
}
/// Reads 'size' bytes from the index file, starting at its current
/// position, and returns them as a string. For a zero size an empty
/// string is returned and the file isn't touched at all.
string StardictDictionary::loadString( size_t size )
{
  // An empty string (e.g. no sametypesequence) needs no reading, and
  // calling front() on an empty vector would be undefined behavior.
  if ( !size )
    return string();

  vector< char > data( size );

  idx.read( &data.front(), data.size() );

  return string( &data.front(), data.size() );
}
void StardictDictionary::getArticleProps( uint32_t articleAddress,
string & headword,
uint32_t & offset, uint32_t & size )
@ -252,14 +269,14 @@ void StardictDictionary::loadArticle( uint32_t address,
char * ptr = articleBody;
if ( ifo.sametypesequence.size() )
if ( sameTypeSequence.size() )
{
/// The sequence is known, it's not stored in the article itself
for( unsigned seq = 0; seq < ifo.sametypesequence.size(); ++seq )
for( unsigned seq = 0; seq < sameTypeSequence.size(); ++seq )
{
// Last entry doesn't have size info -- it is inferred from
// the bytes left
bool entrySizeKnown = ( seq == ifo.sametypesequence.size() - 1 );
bool entrySizeKnown = ( seq == sameTypeSequence.size() - 1 );
uint32_t entrySize;
@ -272,7 +289,7 @@ void StardictDictionary::loadArticle( uint32_t address,
break;
}
char type = ifo.sametypesequence[ seq ];
char type = sameTypeSequence[ seq ];
if ( islower( type ) )
{
@ -610,8 +627,7 @@ static bool tryPossibleName( string const & name, string & copyTo )
}
static void findCorrespondingFiles( string const & ifo,
string & idx, string & dict, string & syn,
bool needSyn )
string & idx, string & dict, string & syn )
{
string base( ifo, 0, ifo.size() - 3 );
@ -633,15 +649,15 @@ static void findCorrespondingFiles( string const & ifo,
) )
throw exNoDictFile( ifo );
if ( needSyn && !(
tryPossibleName( base + "syn", syn ) ||
tryPossibleName( base + "syn.gz", syn ) ||
tryPossibleName( base + "syn.dz", syn ) ||
tryPossibleName( base + "SYN", syn ) ||
tryPossibleName( base + "SYN.GZ", syn ) ||
tryPossibleName( base + "SYN.DZ", syn )
if ( !(
tryPossibleName( base + "syn", syn ) ||
tryPossibleName( base + "syn.gz", syn ) ||
tryPossibleName( base + "syn.dz", syn ) ||
tryPossibleName( base + "SYN", syn ) ||
tryPossibleName( base + "SYN.GZ", syn ) ||
tryPossibleName( base + "SYN.DZ", syn )
) )
throw exNoSynFile( ifo );
syn.clear();
}
static void handleIdxSynFile( string const & fileName,
@ -764,30 +780,16 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
try
{
File::Class ifoFile( *i, "r" );
Ifo ifo( ifoFile );
if ( ifo.idxoffsetbits == 64 )
throw ex64BitsNotSupported();
if ( ifo.dicttype.size() )
throw exDicttypeNotSupported();
printf( "bookname = %s\n", ifo.bookname.c_str() );
printf( "wordcount = %u\n", ifo.wordcount );
vector< string > dictFiles( 1, *i );
string idxFileName, dictFileName, synFileName;
findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName,
ifo.synwordcount );
findCorrespondingFiles( *i, idxFileName, dictFileName, synFileName );
dictFiles.push_back( idxFileName );
dictFiles.push_back( dictFileName );
if ( ifo.synwordcount )
if ( synFileName.size() )
dictFiles.push_back( synFileName );
string dictId = Dictionary::makeDictionaryId( dictFiles );
@ -798,6 +800,33 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
indexIsOldOrBad( indexFile ) )
{
// Building the index
File::Class ifoFile( *i, "r" );
Ifo ifo( ifoFile );
if ( ifo.idxoffsetbits == 64 )
throw ex64BitsNotSupported();
if ( ifo.dicttype.size() )
throw exDicttypeNotSupported();
if( synFileName.empty() )
{
if ( ifo.synwordcount )
throw exNoSynFile( *i );
}
else
if ( !ifo.synwordcount )
{
printf( "Warning: ignoring .syn file %s, since there's no synwordcount in .ifo specified\n",
synFileName.c_str() );
}
printf( "bookname = %s\n", ifo.bookname.c_str() );
printf( "wordcount = %u\n", ifo.wordcount );
initializing.indexingDictionary( ifo.bookname );
File::Class idx( indexFile, "wb" );
@ -811,6 +840,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idx.write( idxHeader );
idx.write( ifo.bookname.data(), ifo.bookname.size() );
idx.write( ifo.sametypesequence.data(), ifo.sametypesequence.size() );
IndexedWords indexedWords;
ChunkedStorage::Writer chunks( idx );
@ -837,13 +869,21 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
// Build index
idxHeader.indexOffset = BtreeIndexing::buildIndex( indexedWords, idx );
IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );
idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
idxHeader.indexRootOffset = idxInfo.rootOffset;
// That concludes it. Update the header.
idxHeader.signature = Signature;
idxHeader.formatVersion = CurrentFormatVersion;
idxHeader.wordCount = ifo.wordcount;
idxHeader.synWordCount = ifo.synwordcount;
idxHeader.bookNameSize = ifo.bookname.size();
idxHeader.sameTypeSequenceSize = ifo.sametypesequence.size();
idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) );
@ -851,9 +891,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
dictionaries.push_back( new StardictDictionary( dictId,
indexFile,
dictFiles,
ifo ) );
dictFiles ) );
}
catch( std::exception & e )
{