goldendict-ng/btreeidx.hh
Konstantin Isakov 953d9237a3 Update copyright to span to 2011.
Better early than late, this year is ending anyway.
2010-11-14 18:47:03 +03:00

197 lines
6.9 KiB
C++

/* This file is (c) 2008-2011 Konstantin Isakov <ikm@users.berlios.de>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __BTREEIDX_HH_INCLUDED__
#define __BTREEIDX_HH_INCLUDED__
#include "dictionary.hh"
#include "file.hh"
#include <string>
#include <vector>
#include <map>
#ifdef _MSC_VER
#include <stdint_msvc.h>
#else
#include <stdint.h>
#endif
/// A base for the dictionary which creates a btree index to look up
/// the words.
namespace BtreeIndexing {
using std::string;
using gd::wstring;
using std::vector;
using std::map;
enum
{
/// This is to be bumped up each time the internal format changes.
/// The value isn't used here by itself, it is supposed to be added
/// to each dictionary's internal format version.
FormatVersion = 3
};
// These exceptions which might be thrown during the index traversal
DEF_EX( exIndexWasNotOpened, "The index wasn't opened", Dictionary::Ex )
DEF_EX( exFailedToDecompressNode, "Failed to decompress a btree's node", Dictionary::Ex )
DEF_EX( exCorruptedChainData, "Corrupted chain data in the leaf of a btree encountered", Dictionary::Ex )
/// This structure describes a word linked to its translation. The
/// translation is represented as an abstract 32-bit offset.
struct WordArticleLink
{
string word, prefix; // in utf8
uint32_t articleOffset;
WordArticleLink()
{}
WordArticleLink( string const & word_, uint32_t articleOffset_, string const & prefix_ = string() ):
word( word_ ), prefix( prefix_ ), articleOffset( articleOffset_ )
{}
};
/// Information needed to open the index
struct IndexInfo
{
uint32_t btreeMaxElements, rootOffset;
IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ):
btreeMaxElements( btreeMaxElements_ ), rootOffset( rootOffset_ )
{}
};
/// Base btree indexing class which allows using what buildIndex() function
/// created. It's quite low-lovel and is basically a set of 'bulding blocks'
/// functions.
class BtreeIndex
{
public:
BtreeIndex();
protected:
/// Opens the index. The file reference is saved to be used for
/// subsequent lookups.
/// The mutex is the one to be locked when working with the file.
void openIndex( IndexInfo const &, File::Class &, Mutex & );
/// Finds articles that match the given string. A case-insensitive search
/// is performed.
vector< WordArticleLink > findArticles( wstring const & );
/// Finds the offset in the btree leaf for the given word, either matching
/// by an exact match, or by finding the smallest entry that might match
/// by prefix. It can return zero if there isn't even a possible prefx
/// match. The input string must already be folded. The exactMatch is set
/// to true when an exact match is located, and to false otherwise.
/// The located leaf is loaded to 'leaf', and the pointer to the next
/// leaf is saved to 'nextLeaf'.
/// However, due to root node being permanently cached, the 'leaf' passed
/// might not get used at all if the root node was the terminal one. In that
/// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
/// the leafEnd pointer always holds the pointer to the first byte outside
/// the node data.
char const * findChainOffsetExactOrPrefix( wstring const & target,
bool & exactMatch,
vector< char > & leaf,
uint32_t & nextLeaf,
char const * & leafEnd );
/// Reads a node or leaf at the given offset. Just uncompresses its data
/// to the given vector and does nothing more.
void readNode( uint32_t offset, vector< char > & out );
/// Reads the word-article links' chain at the given offset. The pointer
/// is updated to point to the next chain, if there's any.
vector< WordArticleLink > readChain( char const * & );
/// Drops any alises which arose due to folding. Only case-folded aliases
/// are left.
void antialias( wstring const &, vector< WordArticleLink > & );
protected:
Mutex * idxFileMutex;
File::Class * idxFile;
private:
uint32_t indexNodeSize;
uint32_t rootOffset;
bool rootNodeLoaded;
vector< char > rootNode; // We load root note here and keep it at all times,
// since all searches always start with it.
};
class BtreeWordSearchRequest;
/// A base for the dictionary that utilizes a btree index build using
/// buildIndex() function declared below.
class BtreeDictionary: public Dictionary::Class, public BtreeIndex
{
public:
BtreeDictionary( string const & id, vector< string > const & dictionaryFiles );
/// Btree-indexed dictionaries are usually a good source for compound searches.
virtual Dictionary::Features getFeatures() const throw()
{ return Dictionary::SuitableForCompoundSearching; }
/// This function does the search using the btree index. Derivatives
/// need not to implement this function.
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &,
unsigned long )
throw( std::exception );
virtual sptr< Dictionary::WordSearchRequest > stemmedMatch( wstring const &,
unsigned minLength,
unsigned maxSuffixVariation,
unsigned long maxResults )
throw( std::exception );
protected:
/// Called before each matching operation to ensure that any child init
/// has completed. Mainly used for deferred init. The default implementation
/// does nothing.
/// The function returns an empty string if the initialization is or was
/// successful, or a human-readable error string otherwise.
virtual string const & ensureInitDone();
friend class BtreeWordSearchRequest;
};
// Everything below is for building the index data.
/// This represents the index in its source form, as a map which binds folded
/// words to sequences of their unfolded source forms and the corresponding
/// article offsets. The words are utf8-encoded -- it doesn't break Unicode
/// sorting, but conserves space.
struct IndexedWords: public map< string, vector< WordArticleLink > >
{
/// Instead of adding to the map directly, use this function. It does folding
/// itself, and for phrases/sentences it adds additional entries beginning with
/// each new word.
void addWord( wstring const & word, uint32_t articleOffset );
/// Differs from addWord() in that it only adds a single entry. We use this
/// for zip's file names.
void addSingleWord( wstring const & word, uint32_t articleOffset );
};
/// Builds the index, as a compressed btree. Returns IndexInfo.
/// All the data is stored to the given file, beginning from its current
/// position.
IndexInfo buildIndex( IndexedWords const &, File::Class & file );
}
#endif