/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
|
|
|
|
#ifndef __BTREEIDX_HH_INCLUDED__
|
|
|
|
#define __BTREEIDX_HH_INCLUDED__
|
|
|
|
|
|
|
|
#include "dictionary.hh"
|
|
|
|
#include "file.hh"
|
2009-04-29 23:18:26 +00:00
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <map>
|
2021-02-01 15:00:31 +00:00
|
|
|
#include <algorithm>
|
2014-02-28 12:36:28 +00:00
|
|
|
#include <QVector>
|
|
|
|
#include <QSet>
|
2014-04-16 16:18:28 +00:00
|
|
|
#include <QList>
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2009-04-29 23:18:26 +00:00
|
|
|
#include <stdint.h>
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// A base for the dictionary which creates a btree index to look up
/// the words.
|
|
|
|
namespace BtreeIndexing {
|
|
|
|
|
|
|
|
using std::string;
|
2009-04-18 17:20:12 +00:00
|
|
|
using gd::wstring;
|
2009-01-28 20:55:45 +00:00
|
|
|
using std::vector;
|
|
|
|
using std::map;
|
|
|
|
|
|
|
|
enum
{
  /// This is to be bumped up each time the internal format changes.
  /// The value isn't used here by itself, it is supposed to be added
  /// to each dictionary's internal format version.
  FormatVersion = 4
};
|
|
|
|
|
|
|
|
// These exceptions which might be thrown during the index traversal
|
|
|
|
|
|
|
|
DEF_EX( exIndexWasNotOpened, "The index wasn't opened", Dictionary::Ex )
|
|
|
|
DEF_EX( exFailedToDecompressNode, "Failed to decompress a btree's node", Dictionary::Ex )
|
|
|
|
DEF_EX( exCorruptedChainData, "Corrupted chain data in the leaf of a btree encountered", Dictionary::Ex )
|
|
|
|
|
|
|
|
/// This structure describes a word linked to its translation. The
/// translation is represented as an abstract 32-bit offset.
struct WordArticleLink
{
  string word, prefix; // in utf8

  /// Offset of the article. Zero-initialized so that a default-constructed
  /// link never carries an indeterminate value.
  uint32_t articleOffset = 0;

  /// Creates an empty link: empty word and prefix, zero offset.
  WordArticleLink()
  {}

  /// Creates a link from the given word and article offset, with an
  /// optional prefix (empty by default).
  WordArticleLink( string const & word_, uint32_t articleOffset_, string const & prefix_ = string() ):
    word( word_ ), prefix( prefix_ ), articleOffset( articleOffset_ )
  {}
};
|
|
|
|
|
2009-04-14 16:35:47 +00:00
|
|
|
/// Information needed to open the index
struct IndexInfo
{
  uint32_t btreeMaxElements;
  uint32_t rootOffset;

  /// Stores both values as given; no validation is performed here.
  IndexInfo( uint32_t btreeMaxElements_, uint32_t rootOffset_ ):
    btreeMaxElements( btreeMaxElements_ ), rootOffset( rootOffset_ )
  {}
};
|
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
/// Base btree indexing class which allows using what buildIndex() function
|
2020-10-20 02:49:37 +00:00
|
|
|
/// created. It's quite low-lovel and is basically a set of 'building blocks'
|
2009-10-25 22:49:24 +00:00
|
|
|
/// functions.
|
|
|
|
class BtreeIndex
|
2009-01-28 20:55:45 +00:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
BtreeIndex();
|
2009-04-17 13:51:50 +00:00
|
|
|
|
2009-04-14 16:35:47 +00:00
|
|
|
/// Opens the index. The file reference is saved to be used for
|
2009-01-28 20:55:45 +00:00
|
|
|
/// subsequent lookups.
|
2009-03-26 19:00:08 +00:00
|
|
|
/// The mutex is the one to be locked when working with the file.
|
2009-04-14 16:35:47 +00:00
|
|
|
void openIndex( IndexInfo const &, File::Class &, Mutex & );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Finds articles that match the given string. A case-insensitive search
|
|
|
|
/// is performed.
|
2018-04-10 14:49:52 +00:00
|
|
|
vector< WordArticleLink > findArticles( wstring const &, bool ignoreDiacritics = false );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
/// Find all unique article links in the index
|
|
|
|
void findAllArticleLinks( QVector< WordArticleLink > & articleLinks );
|
2014-02-28 12:36:28 +00:00
|
|
|
|
2020-10-20 02:49:37 +00:00
|
|
|
/// Retrieve all unique headwords from index
|
2014-02-28 12:36:28 +00:00
|
|
|
void getAllHeadwords( QSet< QString > & headwords );
|
2014-04-16 16:18:28 +00:00
|
|
|
|
|
|
|
/// Find all article links and/or headwords in the index
|
|
|
|
void findArticleLinks( QVector< WordArticleLink > * articleLinks,
|
|
|
|
QSet< uint32_t > * offsets,
|
2014-04-17 14:18:15 +00:00
|
|
|
QSet< QString > * headwords,
|
|
|
|
QAtomicInt * isCancelled = 0 );
|
2014-04-16 16:18:28 +00:00
|
|
|
|
2018-07-07 09:33:15 +00:00
|
|
|
/// Retrieve headwords for presented article addresses
|
2014-04-17 14:18:15 +00:00
|
|
|
void getHeadwordsFromOffsets( QList< uint32_t > & offsets,
|
|
|
|
QVector< QString > & headwords,
|
|
|
|
QAtomicInt * isCancelled = 0 );
|
2014-04-16 16:18:28 +00:00
|
|
|
|
2013-09-14 16:17:32 +00:00
|
|
|
protected:
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
/// Finds the offset in the btree leaf for the given word, either matching
|
|
|
|
/// by an exact match, or by finding the smallest entry that might match
|
|
|
|
/// by prefix. It can return zero if there isn't even a possible prefx
|
|
|
|
/// match. The input string must already be folded. The exactMatch is set
|
|
|
|
/// to true when an exact match is located, and to false otherwise.
|
|
|
|
/// The located leaf is loaded to 'leaf', and the pointer to the next
|
|
|
|
/// leaf is saved to 'nextLeaf'.
|
2009-04-14 16:35:47 +00:00
|
|
|
/// However, due to root node being permanently cached, the 'leaf' passed
|
|
|
|
/// might not get used at all if the root node was the terminal one. In that
|
|
|
|
/// case, the returned pointer wouldn't belong to 'leaf' at all. To that end,
|
|
|
|
/// the leafEnd pointer always holds the pointer to the first byte outside
|
|
|
|
/// the node data.
|
2009-01-28 20:55:45 +00:00
|
|
|
char const * findChainOffsetExactOrPrefix( wstring const & target,
|
|
|
|
bool & exactMatch,
|
|
|
|
vector< char > & leaf,
|
2009-04-14 16:35:47 +00:00
|
|
|
uint32_t & nextLeaf,
|
|
|
|
char const * & leafEnd );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
/// Reads a node or leaf at the given offset. Just uncompresses its data
|
|
|
|
/// to the given vector and does nothing more.
|
|
|
|
void readNode( uint32_t offset, vector< char > & out );
|
|
|
|
|
|
|
|
/// Reads the word-article links' chain at the given offset. The pointer
|
|
|
|
/// is updated to point to the next chain, if there's any.
|
|
|
|
vector< WordArticleLink > readChain( char const * & );
|
|
|
|
|
2020-10-20 02:49:37 +00:00
|
|
|
/// Drops any aliases which arose due to folding. Only case-folded aliases
|
2009-01-28 20:55:45 +00:00
|
|
|
/// are left.
|
2018-04-10 14:49:52 +00:00
|
|
|
void antialias( wstring const &, vector< WordArticleLink > &, bool ignoreDiactitics );
|
2009-03-26 19:00:08 +00:00
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
protected:
|
|
|
|
|
|
|
|
Mutex * idxFileMutex;
|
|
|
|
File::Class * idxFile;
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
|
|
|
uint32_t indexNodeSize;
|
|
|
|
uint32_t rootOffset;
|
|
|
|
bool rootNodeLoaded;
|
|
|
|
vector< char > rootNode; // We load root note here and keep it at all times,
|
|
|
|
// since all searches always start with it.
|
|
|
|
};
|
|
|
|
|
|
|
|
/// A base for the dictionary that utilizes a btree index build using
|
|
|
|
/// buildIndex() function declared below.
|
|
|
|
class BtreeDictionary: public Dictionary::Class, public BtreeIndex
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
|
|
|
BtreeDictionary( string const & id, vector< string > const & dictionaryFiles );
|
|
|
|
|
2010-05-29 20:50:16 +00:00
|
|
|
/// Btree-indexed dictionaries are usually a good source for compound searches.
|
2022-06-03 13:28:41 +00:00
|
|
|
virtual Dictionary::Features getFeatures() const noexcept
|
2010-05-29 20:50:16 +00:00
|
|
|
{ return Dictionary::SuitableForCompoundSearching; }
|
|
|
|
|
2014-07-10 13:55:14 +00:00
|
|
|
/// This function does the search using the btree index. Derivatives usually
|
2009-10-25 22:49:24 +00:00
|
|
|
/// need not to implement this function.
|
|
|
|
virtual sptr< Dictionary::WordSearchRequest > prefixMatch( wstring const &,
|
|
|
|
unsigned long )
|
2022-01-09 08:35:07 +00:00
|
|
|
;
|
2009-10-25 22:49:24 +00:00
|
|
|
|
|
|
|
virtual sptr< Dictionary::WordSearchRequest > stemmedMatch( wstring const &,
|
|
|
|
unsigned minLength,
|
|
|
|
unsigned maxSuffixVariation,
|
|
|
|
unsigned long maxResults )
|
2022-01-09 08:35:07 +00:00
|
|
|
;
|
2009-10-25 22:49:24 +00:00
|
|
|
|
2013-06-09 13:31:57 +00:00
|
|
|
virtual bool isLocalDictionary()
|
|
|
|
{ return true; }
|
|
|
|
|
2014-02-28 12:36:28 +00:00
|
|
|
virtual bool getHeadwords( QStringList &headwords );
|
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
virtual void getArticleText( uint32_t articleAddress, QString & headword, QString & text );
|
|
|
|
|
|
|
|
string const & ftsIndexName() const
|
|
|
|
{ return ftsIdxName; }
|
|
|
|
|
|
|
|
Mutex & getFtsMutex()
|
|
|
|
{ return ftsIdxMutex; }
|
2009-10-25 22:49:24 +00:00
|
|
|
|
2014-11-22 14:22:04 +00:00
|
|
|
virtual uint32_t getFtsIndexVersion()
|
|
|
|
{ return 0; }
|
|
|
|
|
2018-03-07 21:17:09 +00:00
|
|
|
// Sort articles offsets for full-text search in dictionary-specific order
|
|
|
|
// to increase of articles retrieving speed
|
|
|
|
// Default - simple sorting in increase order
|
2018-03-08 08:46:19 +00:00
|
|
|
virtual void sortArticlesOffsetsForFTS( QVector< uint32_t > & offsets,
|
|
|
|
QAtomicInt & isCancelled )
|
2021-02-01 15:00:31 +00:00
|
|
|
{ Q_UNUSED( isCancelled ); std::sort( offsets.begin(), offsets.end() ); }
|
2018-03-07 21:17:09 +00:00
|
|
|
|
2009-10-25 22:49:24 +00:00
|
|
|
/// Called before each matching operation to ensure that any child init
|
|
|
|
/// has completed. Mainly used for deferred init. The default implementation
|
|
|
|
/// does nothing.
|
|
|
|
/// The function returns an empty string if the initialization is or was
|
|
|
|
/// successful, or a human-readable error string otherwise.
|
|
|
|
virtual string const & ensureInitDone();
|
|
|
|
|
2014-04-16 16:18:28 +00:00
|
|
|
protected:
|
|
|
|
Mutex ftsIdxMutex;
|
|
|
|
string ftsIdxName;
|
|
|
|
|
2009-03-26 19:00:08 +00:00
|
|
|
friend class BtreeWordSearchRequest;
|
2014-04-16 16:18:28 +00:00
|
|
|
friend class FTSResultsRequest;
|
2009-01-28 20:55:45 +00:00
|
|
|
};
|
|
|
|
|
2014-07-10 13:55:14 +00:00
|
|
|
class BtreeWordSearchRequest: public Dictionary::WordSearchRequest
|
|
|
|
{
|
|
|
|
friend class BtreeWordSearchRunnable;
|
|
|
|
protected:
|
|
|
|
BtreeDictionary & dict;
|
|
|
|
wstring str;
|
|
|
|
unsigned long maxResults;
|
|
|
|
unsigned minLength;
|
|
|
|
int maxSuffixVariation;
|
|
|
|
bool allowMiddleMatches;
|
|
|
|
QAtomicInt isCancelled;
|
|
|
|
QSemaphore hasExited;
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
|
|
|
BtreeWordSearchRequest( BtreeDictionary & dict_,
|
|
|
|
wstring const & str_,
|
|
|
|
unsigned minLength_,
|
|
|
|
int maxSuffixVariation_,
|
|
|
|
bool allowMiddleMatches_,
|
|
|
|
unsigned long maxResults_,
|
|
|
|
bool startRunnable = true );
|
|
|
|
|
|
|
|
virtual void findMatches();
|
|
|
|
|
|
|
|
void run(); // Run from another thread by BtreeWordSearchRunnable
|
|
|
|
|
|
|
|
virtual void cancel()
|
|
|
|
{
|
|
|
|
isCancelled.ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~BtreeWordSearchRequest();
|
|
|
|
};
|
|
|
|
|
2009-01-28 20:55:45 +00:00
|
|
|
// Everything below is for building the index data.
|
|
|
|
|
|
|
|
/// This represents the index in its source form, as a map which binds folded
|
|
|
|
/// words to sequences of their unfolded source forms and the corresponding
|
2009-04-19 13:45:14 +00:00
|
|
|
/// article offsets. The words are utf8-encoded -- it doesn't break Unicode
|
|
|
|
/// sorting, but conserves space.
|
|
|
|
struct IndexedWords: public map< string, vector< WordArticleLink > >
|
2009-04-08 16:02:12 +00:00
|
|
|
{
|
|
|
|
/// Instead of adding to the map directly, use this function. It does folding
|
|
|
|
/// itself, and for phrases/sentences it adds additional entries beginning with
|
|
|
|
/// each new word.
|
2022-08-13 04:41:01 +00:00
|
|
|
void addWord( wstring const & word, uint32_t articleOffset, unsigned int maxHeadwordSize = 100U );
|
2009-10-25 22:49:24 +00:00
|
|
|
|
|
|
|
/// Differs from addWord() in that it only adds a single entry. We use this
|
|
|
|
/// for zip's file names.
|
|
|
|
void addSingleWord( wstring const & word, uint32_t articleOffset );
|
2009-04-08 16:02:12 +00:00
|
|
|
};
|
2009-01-28 20:55:45 +00:00
|
|
|
|
2009-04-14 16:35:47 +00:00
|
|
|
/// Builds the index, as a compressed btree. Returns IndexInfo.
|
2009-01-28 20:55:45 +00:00
|
|
|
/// All the data is stored to the given file, beginning from its current
|
|
|
|
/// position.
|
2009-04-14 16:35:47 +00:00
|
|
|
IndexInfo buildIndex( IndexedWords const &, File::Class & file );
|
2009-01-28 20:55:45 +00:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|