// Mirror of https://github.com/xiaoyifang/goldendict-ng.git
// (synced 2024-11-23 20:14:05 +00:00; 414 lines, 14 KiB, C++)
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
|
|
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#ifndef __DICTIONARY_HH_INCLUDED__
|
|
#define __DICTIONARY_HH_INCLUDED__
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
#include <map>
|
|
#include <QObject>
|
|
#include <QIcon>
|
|
#include "sptr.hh"
|
|
#include "ex.hh"
|
|
#include "mutex.hh"
|
|
#include "wstring.hh"
|
|
|
|
/// Abstract dictionary-related stuff
|
|
namespace Dictionary {
|
|
|
|
using std::vector;
|
|
using std::string;
|
|
using gd::wstring;
|
|
using std::map;
|
|
|
|
/// Kinds of descriptive metadata a dictionary can expose. Values of this enum
/// are the keys of the map returned by Class::getProperties().
enum Property
{
  Author,
  Copyright,
  Description,
  Email
};
|
|
|
|
// Exception types of this namespace, declared through the DEF_EX macro
// (presumably provided by ex.hh). Ex is the common base, itself derived from
// std::exception; the remaining ones derive from Ex.
DEF_EX( Ex, "Dictionary error", std::exception )
DEF_EX( exIndexOutOfRange, "The supplied index is out of range", Ex )
DEF_EX( exSliceOutOfRange, "The requested data slice is out of range", Ex )
DEF_EX( exRequestUnfinished, "The request hasn't yet finished", Ex )
|
|
|
|
/// When you request a search to be performed in a dictionary, you get
|
|
/// this structure in return. It accumulates search results over time.
|
|
/// The finished() signal is emitted when the search has finished and there's
|
|
/// no more matches to be expected. Note that before connecting to it, check
|
|
/// the result of isFinished() -- if it's 'true', the search was instantaneous.
|
|
/// Destroy the object when you are not interested in results anymore.
|
|
///
|
|
/// Creating, destroying and calling member functions of the requests is done
|
|
/// in the GUI thread, however. Therefore, it is important to make sure those
|
|
/// operations are fast (this is most important for word searches, where
|
|
/// new requests are created and old ones deleted immediately upon a user
|
|
/// changing query).
|
|
class Request: public QObject
|
|
{
|
|
Q_OBJECT
|
|
|
|
public:
|
|
|
|
/// Returns whether the request has been processed in full and finished.
|
|
/// This means that the data accumulated is final and won't change anymore.
|
|
bool isFinished();
|
|
|
|
/// Either returns an empty string in case there was no error processing
|
|
/// the request, or otherwise a human-readable string describing the problem.
|
|
/// Note that an empty result, such as a lack of word or of an article isn't
|
|
/// an error -- but any kind of failure to connect to, or read the dictionary
|
|
/// is.
|
|
QString getErrorString();
|
|
|
|
/// Cancels the ongoing request. This may make Request destruct faster some
|
|
/// time in the future, Use this in preparation to destruct many Requests,
|
|
/// so that they'd be cancelling in parallel. When the request was fully
|
|
/// cancelled, it must emit the finished() signal, either as a result of an
|
|
/// actual finish which has happened just before the cancellation, or solely as
|
|
/// a result of a request being cancelled (in the latter case, the actual
|
|
/// request result may be empty or incomplete). That is, finish() must be
|
|
/// called by a derivative at least once if cancel() was called, either after
|
|
/// or before it was called.
|
|
virtual void cancel()=0;
|
|
|
|
virtual ~Request()
|
|
{}
|
|
|
|
signals:
|
|
|
|
/// This signal is emitted when more data becomes available. Local
|
|
/// dictionaries typically don't call this, since it is preferred that all
|
|
/// data would be available from them at once, but network dictionaries
|
|
/// might call that.
|
|
void updated();
|
|
|
|
/// This signal is emitted when the request has been processed in full and
|
|
/// finished. That is, it's emitted when isFinished() turns true.
|
|
void finished();
|
|
|
|
protected:
|
|
|
|
/// Called by derivatives to signal update().
|
|
void update();
|
|
|
|
/// Called by derivatives to set isFinished() flag and signal finished().
|
|
void finish();
|
|
|
|
/// Sets the error string to be returned by getErrorString().
|
|
void setErrorString( QString const & );
|
|
|
|
private:
|
|
|
|
QAtomicInt isFinishedFlag;
|
|
|
|
Mutex errorStringMutex;
|
|
QString errorString;
|
|
};
|
|
|
|
/// This structure represents the word found. In addition to holding the
/// word itself, it also holds its weight. It is 0 by default. Negative
/// values should be used to store distance from Levenshtein-like matching
/// algorithms. Positive values are used by morphology matches.
struct WordMatch
{
  wstring word;
  int weight;

  /// Creates an empty match with a zero weight.
  WordMatch(): weight( 0 ) {}

  /// Creates a match for the given word with a zero weight. Deliberately left
  /// non-explicit, so that a plain wstring converts to a WordMatch.
  WordMatch( wstring const & word_ ): word( word_ ), weight( 0 ) {}

  /// Creates a match for the given word with the given weight.
  WordMatch( wstring const & word_, int weight_ ):
    word( word_ ), weight( weight_ ) {}
};
|
|
|
|
/// This request type corresponds to all types of word searching operations.
|
|
class WordSearchRequest: public Request
|
|
{
|
|
Q_OBJECT
|
|
|
|
public:
|
|
|
|
WordSearchRequest(): uncertain( false )
|
|
{}
|
|
|
|
/// Returns the number of matches found. The value can grow over time
|
|
/// unless isFinished() is true.
|
|
size_t matchesCount();
|
|
|
|
/// Returns the match with the given zero-based index, which should be less
|
|
/// than matchesCount().
|
|
WordMatch operator [] ( size_t index ) throw( exIndexOutOfRange );
|
|
|
|
/// Returns all the matches found. Since no further locking can or would be
|
|
/// done, this can only be called after the request has finished.
|
|
vector< WordMatch > & getAllMatches() throw( exRequestUnfinished );
|
|
|
|
/// Returns true if the match was uncertain -- that is, there may be more
|
|
/// results in the dictionary itself, the dictionary index isn't good enough
|
|
/// to tell that.
|
|
bool isUncertain() const
|
|
{ return uncertain; }
|
|
|
|
protected:
|
|
|
|
// Subclasses should be filling up the 'matches' array, locking the mutex when
|
|
// whey work with it.
|
|
Mutex dataMutex;
|
|
|
|
vector< WordMatch > matches;
|
|
bool uncertain;
|
|
};
|
|
|
|
/// This request type corresponds to any kinds of data responses where a
|
|
/// single large blob of binary data is returned. It currently used of article
|
|
/// bodies and resources.
|
|
class DataRequest: public Request
|
|
{
|
|
Q_OBJECT
|
|
|
|
public:
|
|
|
|
/// Returns the number of bytes read, with a -1 meaning that so far it's
|
|
/// uncertain whether resource even exists or not, and any non-negative value
|
|
/// meaning that that amount of bytes is not available.
|
|
/// If -1 is still being returned after the request has finished, that means
|
|
/// the resource wasn't found.
|
|
long dataSize();
|
|
|
|
/// Writes "size" bytes starting from "offset" of the data read to the given
|
|
/// buffer. "size + offset" must be <= than dataSize().
|
|
void getDataSlice( size_t offset, size_t size, void * buffer )
|
|
throw( exSliceOutOfRange );
|
|
|
|
/// Returns all the data read. Since no further locking can or would be
|
|
/// done, this can only be called after the request has finished.
|
|
vector< char > & getFullData() throw( exRequestUnfinished );
|
|
|
|
DataRequest(): hasAnyData( false ) {}
|
|
|
|
protected:
|
|
|
|
// Subclasses should be filling up the 'data' array, locking the mutex when
|
|
// whey work with it.
|
|
Mutex dataMutex;
|
|
|
|
bool hasAnyData; // With this being false, dataSize() always returns -1
|
|
vector< char > data;
|
|
};
|
|
|
|
/// A helper class for syncronous word search implementations.
|
|
class WordSearchRequestInstant: public WordSearchRequest
|
|
{
|
|
public:
|
|
|
|
WordSearchRequestInstant()
|
|
{ finish(); }
|
|
|
|
virtual void cancel()
|
|
{}
|
|
|
|
vector< WordMatch > & getMatches()
|
|
{ return matches; }
|
|
|
|
void setUncertain( bool value )
|
|
{ uncertain = value; }
|
|
};
|
|
|
|
/// A helper class for syncronous data read implementations.
|
|
class DataRequestInstant: public DataRequest
|
|
{
|
|
public:
|
|
|
|
DataRequestInstant( bool succeeded )
|
|
{ hasAnyData = succeeded; finish(); }
|
|
|
|
DataRequestInstant( QString const & errorString )
|
|
{ setErrorString( errorString ); finish(); }
|
|
|
|
virtual void cancel()
|
|
{}
|
|
|
|
vector< char > & getData()
|
|
{ return data; }
|
|
};
|
|
|
|
/// Dictionary features. Different dictionaries can possess different features,
/// which hint at some of their aspects. The values are used as bit flags, so
/// any newly added feature should get its own distinct bit.
enum Feature
{
  /// No features
  NoFeatures = 0,
  /// The dictionary is suitable to query when searching for compound expressions.
  SuitableForCompoundSearching = 1
};
|
|
|
|
// Declares Features as the flags type holding a combination of Feature values,
// along with the operators needed to combine those values.
Q_DECLARE_FLAGS( Features, Feature )
Q_DECLARE_OPERATORS_FOR_FLAGS( Features )
|
|
|
|
/// A dictionary. Can be used to query words.
|
|
class Class
|
|
{
|
|
string id;
|
|
vector< string > dictionaryFiles;
|
|
|
|
public:
|
|
|
|
/// Creates a dictionary. The id should be made using
|
|
/// Format::makeDictionaryId(), the dictionaryFiles is the file names the
|
|
/// dictionary consists of.
|
|
Class( string const & id, vector< string > const & dictionaryFiles );
|
|
|
|
/// Called once after the dictionary is constructed. Usually called for each
|
|
/// dictionaries once all dictionaries were made. The implementation should
|
|
/// queue any initialization tasks the dictionary decided to postpone to
|
|
/// threadpools, network requests etc, so the system could complete them
|
|
/// in background.
|
|
/// The default implementation does nothing.
|
|
virtual void deferredInit();
|
|
|
|
/// Returns the dictionary's id.
|
|
string getId() throw()
|
|
{ return id; }
|
|
|
|
/// Returns the list of file names the dictionary consists of.
|
|
vector< string > const & getDictionaryFilenames() throw()
|
|
{ return dictionaryFiles; }
|
|
|
|
/// Returns the dictionary's full name, utf8.
|
|
virtual string getName() throw()=0;
|
|
|
|
/// Returns all the available properties, like the author's name, copyright,
|
|
/// description etc. All strings are in utf8.
|
|
virtual map< Property, string > getProperties() throw()=0;
|
|
|
|
/// Returns the features the dictionary possess. See the Feature enum for
|
|
/// their list.
|
|
virtual Features getFeatures() const throw()
|
|
{ return NoFeatures; }
|
|
|
|
/// Returns the number of articles in the dictionary.
|
|
virtual unsigned long getArticleCount() throw()=0;
|
|
|
|
/// Returns the number of words in the dictionary. This can be equal to
|
|
/// the number of articles, or can be larger if some synonyms are present.
|
|
virtual unsigned long getWordCount() throw()=0;
|
|
|
|
/// Returns the dictionary's icon.
|
|
virtual QIcon getIcon() throw()
|
|
{ return QIcon(); }
|
|
|
|
/// Returns the dictionary's native icon. Dsl icons are usually rectangular,
|
|
/// and are adapted by getIcon() to be square. This function allows getting
|
|
/// the original icon with no geometry transformations applied.
|
|
virtual QIcon getNativeIcon() throw()
|
|
{ return getIcon(); }
|
|
|
|
/// Returns the dictionary's source language.
|
|
virtual quint32 getLangFrom() const
|
|
{ return 0; }
|
|
|
|
/// Returns the dictionary's target language.
|
|
virtual quint32 getLangTo() const
|
|
{ return 0; }
|
|
|
|
/// Looks up a given word in the dictionary, aiming for exact matches and
|
|
/// prefix matches. If it's not possible to locate any prefix matches, no
|
|
/// prefix results should be added. Not more than maxResults results should
|
|
/// be stored. The whole operation is supposed to be fast, though some
|
|
/// dictionaries, the network ones particularly, may of course be slow.
|
|
virtual sptr< WordSearchRequest > prefixMatch( wstring const &,
|
|
unsigned long maxResults ) throw( std::exception )=0;
|
|
|
|
/// Looks up a given word in the dictionary, aiming to find different forms
|
|
/// of the given word by allowing suffix variations. This means allowing words
|
|
/// which can be as short as the input word size minus maxSuffixVariation, or as
|
|
/// long as the input word size plus maxSuffixVariation, which share at least
|
|
/// the input word size minus maxSuffixVariation initial symbols.
|
|
/// Since the goal is to find forms of the words, no matches where a word
|
|
/// in the middle of a phrase got matched should be returned.
|
|
/// The default implementation does nothing, returning an empty result.
|
|
virtual sptr< WordSearchRequest > stemmedMatch( wstring const &,
|
|
unsigned minLength,
|
|
unsigned maxSuffixVariation,
|
|
unsigned long maxResults ) throw( std::exception );
|
|
|
|
/// Finds known headwords for the given word, that is, the words for which
|
|
/// the given word is a synonym. If a dictionary can't perform this operation,
|
|
/// it should leave the default implementation which always returns an empty
|
|
/// result.
|
|
virtual sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & )
|
|
throw( std::exception );
|
|
|
|
/// For a given word, provides alternate writings of it which are to be looked
|
|
/// up alongside with it. Transliteration dictionaries implement this. The
|
|
/// default implementation returns an empty list. Note that this function is
|
|
/// supposed to be very fast and simple, and the results are thus returned
|
|
/// syncronously.
|
|
virtual vector< wstring > getAlternateWritings( wstring const & )
|
|
throw();
|
|
|
|
/// Returns a definition for the given word. The definition should
|
|
/// be an html fragment (without html/head/body tags) in an utf8 encoding.
|
|
/// The 'alts' vector could contain a list of words the definitions of which
|
|
/// should be included in the output as well, being treated as additional
|
|
/// synonyms for the main word.
|
|
/// context is a dictionary-specific data, currently only used for the
|
|
/// 'Websites' feature.
|
|
virtual sptr< DataRequest > getArticle( wstring const &,
|
|
vector< wstring > const & alts,
|
|
wstring const & context = wstring() )
|
|
throw( std::exception )=0;
|
|
|
|
/// Loads contents of a resource named 'name' into the 'data' vector. This is
|
|
/// usually a picture file referenced in the article or something like that.
|
|
/// The default implementation always returns the non-existing resource
|
|
/// response.
|
|
virtual sptr< DataRequest > getResource( string const & /*name*/ )
|
|
throw( std::exception );
|
|
|
|
virtual ~Class()
|
|
{}
|
|
};
|
|
|
|
/// Callbacks to be used when the dictionaries are being initialized. This is
/// a pure interface -- the caller interested in notifications implements it.
class Initializing
{
public:

  /// Called by the Format instance to notify the caller that the given
  /// dictionary is being indexed. Since indexing can take some time, this
  /// is useful to show in some kind of a splash screen.
  /// The dictionaryName is in utf8.
  virtual void indexingDictionary( string const & dictionaryName ) throw()=0;

  /// Virtual, since implementations are used through this interface.
  virtual ~Initializing()
  {}
};
|
|
|
|
/// Generates an id based on the set of file names which the dictionary
|
|
/// consists of. The resulting id is an alphanumeric hex value made by
|
|
/// hashing the file names. This id should be used to identify dictionary
|
|
/// and for the index file name, if one is needed.
|
|
/// This function is supposed to be used by dictionary implementations.
|
|
string makeDictionaryId( vector< string > const & dictionaryFiles ) throw();
|
|
|
|
/// Checks if it is needed to regenerate index file based on its timestamp
|
|
/// and the timestamps of the dictionary files. If some files are newer than
|
|
/// the index file, or the index file doesn't exist, returns true. If some
|
|
/// dictionary files don't exist, returns true, too.
|
|
/// This function is supposed to be used by dictionary implementations.
|
|
bool needToRebuildIndex( vector< string > const & dictionaryFiles,
|
|
string const & indexFile ) throw();
|
|
|
|
/// Returns a random dictionary id useful for interactively created
|
|
/// dictionaries.
|
|
QString generateRandomDictionaryId();
|
|
|
|
}
|
|
|
|
#endif
|
|
|