goldendict-ng/winlibs/include/xapian/postingsource.h
2022-10-06 01:39:59 +08:00

784 lines
29 KiB
C++

/** @file
* @brief External sources of posting information
*/
/* Copyright (C) 2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
* Copyright (C) 2008,2009 Lemur Consulting Ltd
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef XAPIAN_INCLUDED_POSTINGSOURCE_H
#define XAPIAN_INCLUDED_POSTINGSOURCE_H
#if !defined XAPIAN_IN_XAPIAN_H && !defined XAPIAN_LIB_BUILD
# error Never use <xapian/postingsource.h> directly; include <xapian.h> instead.
#endif
#include <xapian/attributes.h>
#include <xapian/database.h>
#include <xapian/deprecated.h>
#include <xapian/intrusive_ptr.h>
#include <xapian/postingiterator.h>
#include <xapian/types.h>
#include <xapian/valueiterator.h>
#include <xapian/visibility.h>
#include <string>
#include <map>
namespace Xapian {
class Registry;
/** Base class which provides an "external" source of postings.
*
* Subclasses must implement the pure virtual methods declared here: init(),
* next(), get_docid(), at_end() and the three get_termfreq_*() bounds. The
* remaining virtual methods (get_weight(), skip_to(), check(), clone(),
* name(), serialise(), unserialise(), unserialise_with_registry() and
* get_description()) have default implementations which are described on
* each method.
*
* Objects of this class cannot be copied or assigned: the copy constructor
* and assignment operator are declared private.
*/
class XAPIAN_VISIBILITY_DEFAULT PostingSource
: public Xapian::Internal::opt_intrusive_base {
/// Don't allow assignment.
void operator=(const PostingSource &);
/// Don't allow copying.
PostingSource(const PostingSource &);
/// The current upper bound on what get_weight() can return.
double max_weight_;
/** The object to inform of maxweight changes.
*
* We store this as a (void*) to avoid needing to declare an internal
* type in an external header. It's actually (MultiMatch *).
*/
void * matcher_;
public:
/** Allow subclasses to be instantiated.
*
* Initialises the maximum weight to 0 and leaves the matcher unset (NULL).
*/
XAPIAN_NOTHROW(PostingSource())
: max_weight_(0), matcher_(NULL) { }
/** @private @internal Set the object to inform of maxweight changes.
*
* This method is for internal use only - it would be private except that
* would force us to forward declare an internal class in an external API
* header just to make it a friend.
*
* @param matcher The matcher object, stored as an untyped pointer.
*/
void register_matcher_(void * matcher) { matcher_ = matcher; }
/// Virtual destructor, so that subclasses are destroyed correctly when
/// deleted via a pointer to PostingSource.
virtual ~PostingSource();
/** A lower bound on the number of documents this object can return.
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*/
virtual Xapian::doccount get_termfreq_min() const = 0;
/** An estimate of the number of documents this object can return.
*
* It must always be true that:
*
* get_termfreq_min() <= get_termfreq_est() <= get_termfreq_max()
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*/
virtual Xapian::doccount get_termfreq_est() const = 0;
/** An upper bound on the number of documents this object can return.
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*/
virtual Xapian::doccount get_termfreq_max() const = 0;
/** Specify an upper bound on what get_weight() will return from now on.
*
* This upper bound is used by the matcher to perform various
* optimisations, so if you can return a good bound, then matches
* will generally run faster.
*
* This method should be called after calling init(), and may be called
* during iteration if the upper bound drops. It is probably only useful
* to call from subclasses (it was actually a "protected" method prior to
* Xapian 1.3.4, but that makes it tricky to wrap for other languages).
*
* It is valid for the posting source to have returned a higher value from
* get_weight() earlier in the iteration, but the posting source must not
* return a higher value from get_weight() than the currently set upper
* bound, and the upper bound must not be increased (until init() has been
* called).
*
* If you don't call this method, the upper bound will default to 0, for
* convenience when implementing "weight-less" PostingSource subclasses.
*
* @param max_weight The upper bound to set.
*/
void set_maxweight(double max_weight);
/// Return the currently set upper bound on what get_weight() can return.
double XAPIAN_NOTHROW(get_maxweight() const) { return max_weight_; }
/** Return the weight contribution for the current document.
*
* This default implementation always returns 0, for convenience when
* implementing "weight-less" PostingSource subclasses.
*
* This method may assume that it will only be called when there is a
* "current document". In detail: Xapian will always call init() on a
* PostingSource before calling this for the first time. It will also
* only call this if the PostingSource reports that it is pointing to a
* valid document (ie, it will not call it before calling at least one of
* next(), skip_to() or check(), and will ensure that the PostingSource is
* not at the end by calling at_end()).
*/
virtual double get_weight() const;
/** Return the current docid.
*
* This method may assume that it will only be called when there is a
* "current document". See @a get_weight() for details.
*
* Note: in the case of a multi-database search, the returned docid should
* be in the single subdatabase relevant to this posting source. See the
* @a init() method for details.
*/
virtual Xapian::docid get_docid() const = 0;
/** Advance the current position to the next matching document.
*
* The PostingSource starts before the first entry in the list, so next(),
* skip_to() or check() must be called before any methods which need the
* context of the current position.
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*
* @param min_wt The minimum weight contribution that is needed (this is
* just a hint which subclasses may ignore).
*/
virtual void next(double min_wt) = 0;
/** Advance to the specified docid.
*
* If the specified docid isn't in the list, position ourselves on the
* first document after it (or at_end() if no greater docids are present).
*
* If the current position is already the specified docid, this method will
* leave the position unmodified.
*
* If the specified docid is earlier than the current position, the
* behaviour is unspecified. A sensible behaviour would be to leave the
* current position unmodified, but it is also reasonable to move to the
* specified docid.
*
* The default implementation calls next() repeatedly, which works but
* skip_to() can often be implemented much more efficiently.
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*
* Note: in the case of a multi-database search, the docid specified is
* the docid in the single subdatabase relevant to this posting source.
* See the @a init() method for details.
*
* @param did The document id to advance to.
* @param min_wt The minimum weight contribution that is needed (this is
* just a hint which subclasses may ignore).
*/
virtual void skip_to(Xapian::docid did, double min_wt);
/** Check if the specified docid occurs.
*
* The caller is required to ensure that the specified document id @a did
* actually exists in the database. If @a did occurs in this posting
* source, this method must move to that document id, and return true. If
* it does not occur, this method may either:
*
* - return true, having moved to a definite position (including
* "at_end"), which must be the same position as skip_to() would have
* moved to.
*
* or
*
* - return false, having moved to an "indeterminate" position, such that
* a subsequent call to next() or skip_to() will move to the next
* matching position after @a did.
*
* Generally, this method should act like skip_to() and return true if
* that can be done at little extra cost.
*
* Otherwise it should simply check if a particular docid is present,
* returning true if it is, and false if it isn't.
*
* The default implementation calls skip_to() and always returns true.
*
* Xapian will always call init() on a PostingSource before calling this
* for the first time.
*
* Note: in the case of a multi-database search, the docid specified is
* the docid in the single subdatabase relevant to this posting source.
* See the @a init() method for details.
*
* @param did The document id to check.
* @param min_wt The minimum weight contribution that is needed (this is
* just a hint which subclasses may ignore).
*/
virtual bool check(Xapian::docid did, double min_wt);
/** Return true if the current position is past the last entry in this list.
*
* At least one of @a next(), @a skip_to() or @a check() will be called
* before this method is first called.
*/
virtual bool at_end() const = 0;
/** Clone the posting source.
*
* The clone should inherit the configuration of the parent, but need not
* inherit the state. ie, the clone does not need to be in the same
* iteration position as the original: the matcher will always call
* init() on the clone before attempting to move the iterator, or read
* the information about the current position of the iterator.
*
* This may return NULL to indicate that cloning is not supported. In
* this case, the PostingSource may only be used with a single-database
* search.
*
* The default implementation returns NULL.
*
* Note that the returned object will be deallocated by Xapian after use
* with "delete". If you want to handle the deletion in a special way
* (for example when wrapping the Xapian API for use from another
* language) then you can define a static <code>operator delete</code>
* method in your subclass as shown here:
* https://trac.xapian.org/ticket/554#comment:1
*/
virtual PostingSource * clone() const;
/** Name of the posting source class.
*
* This is used when serialising and unserialising posting sources; for
* example, for performing remote searches.
*
* If the subclass is in a C++ namespace, the namespace should be included
* in the name, using "::" as a separator. For example, for a
* PostingSource subclass called "FooPostingSource" in the "Xapian"
* namespace the result of this call should be "Xapian::FooPostingSource".
*
* This should only be implemented if serialise() and unserialise() are
* also implemented. The default implementation returns an empty string.
*
* If this returns an empty string, Xapian will assume that serialise()
* and unserialise() are not implemented.
*/
virtual std::string name() const;
/** Serialise object parameters into a string.
*
* The serialised parameters should represent the configuration of the
* posting source, but need not (indeed, should not) represent the current
* iteration state.
*
* If you don't want to support the remote backend, you can use the
* default implementation which simply throws Xapian::UnimplementedError.
*/
virtual std::string serialise() const;
/** Create object given string serialisation returned by serialise().
*
* Note that the returned object will be deallocated by Xapian after use
* with "delete". If you want to handle the deletion in a special way
* (for example when wrapping the Xapian API for use from another
* language) then you can define a static <code>operator delete</code>
* method in your subclass as shown here:
* https://trac.xapian.org/ticket/554#comment:1
*
* If you don't want to support the remote backend, you can use the
* default implementation which simply throws Xapian::UnimplementedError.
*
* @param serialised A serialised instance of this PostingSource subclass.
*/
virtual PostingSource * unserialise(const std::string &serialised) const;
/** Create object given string serialisation returned by serialise().
*
* Note that the returned object will be deallocated by Xapian after use
* with "delete". If you want to handle the deletion in a special way
* (for example when wrapping the Xapian API for use from another
* language) then you can define a static <code>operator delete</code>
* method in your subclass as shown here:
* https://trac.xapian.org/ticket/554#comment:1
*
* This method is supplied with a Registry object, which can be used when
* unserialising objects contained within the posting source. The default
* implementation simply calls unserialise() which doesn't take the
* Registry object, so you do not need to implement this method unless you
* want to take advantage of the Registry object when unserialising.
*
* @param serialised A serialised instance of this PostingSource subclass.
* @param registry The Xapian::Registry object to use.
*/
virtual PostingSource * unserialise_with_registry(const std::string &serialised,
const Registry & registry) const;
/** Set this PostingSource to the start of the list of postings.
*
* This is called automatically by the matcher prior to each query being
* processed.
*
* If a PostingSource is used for multiple searches, @a init() will
* therefore be called multiple times, and must handle this by using the
* database passed in the most recent call.
*
* @param db The database which the PostingSource should iterate through.
*
* Note: in the case of a multi-database search, a separate PostingSource
* will be used for each database (the separate PostingSources will be
* obtained using @a clone()), and each PostingSource will be passed one of
* the sub-databases as the @a db parameter here. The @a db parameter
* will therefore always refer to a single database. All docids passed
* to, or returned from, the PostingSource refer to docids in that single
* database, rather than in the multi-database.
*/
virtual void init(const Database & db) = 0;
/** Return a string describing this object.
*
* This default implementation returns a generic answer. This default
* is provided so that those deriving their own PostingSource subclass
* aren't forced to implement this (they may not care what
* get_description() gives for their subclass).
*/
virtual std::string get_description() const;
/** Start reference counting this object.
*
* You can hand ownership of a dynamically allocated PostingSource
* object to Xapian by calling release() and then passing the object to a
* Xapian method. Xapian will arrange to delete the object once it is no
* longer required.
*
* @return This object, so the call can be used inline as an argument.
*/
PostingSource * release() {
opt_intrusive_base::release();
return this;
}
/** Start reference counting this object (overload for const objects).
*
* You can hand ownership of a dynamically allocated PostingSource
* object to Xapian by calling release() and then passing the object to a
* Xapian method. Xapian will arrange to delete the object once it is no
* longer required.
*
* @return This object, so the call can be used inline as an argument.
*/
const PostingSource * release() const {
opt_intrusive_base::release();
return this;
}
};
/** A posting source which generates weights from a value slot.
*
* This is a base class for classes which generate weights using values stored
* in the specified slot. For example, ValueWeightPostingSource uses
* sortable_unserialise to convert values directly to weights.
*
* The upper bound on the weight returned is set to DBL_MAX. Subclasses
* should call set_maxweight() in their init() methods after calling
* ValuePostingSource::init() if they know a tighter bound on the weight.
*
* Subclasses should use the public accessor methods (get_database(),
* get_slot(), get_value(), get_started(), done() and set_termfreq_*()) in
* preference to the deprecated protected reference members.
*/
class XAPIAN_VISIBILITY_DEFAULT ValuePostingSource : public PostingSource {
// We want to give a deprecation warning for uses of the members from user
// code, but we also want to be able to inline functions to access them,
// without those functions generating deprecated warnings. To achieve
// this, we make the old names references to members with a "real_" prefix
// and then use the latter in the inlined accessor functions. The
// constructor initialises all the references to point to their "real_"
// counterparts.
Xapian::Database real_db;
Xapian::valueno real_slot;
Xapian::ValueIterator real_value_it;
bool real_started;
Xapian::doccount real_termfreq_min;
Xapian::doccount real_termfreq_est;
Xapian::doccount real_termfreq_max;
protected:
/** The database we're reading values from.
*
* @deprecated Use @a get_database() in preference.
*/
XAPIAN_DEPRECATED(Xapian::Database& db);
/** The slot we're reading values from.
*
* @deprecated Use @a get_slot() in preference.
*/
XAPIAN_DEPRECATED(Xapian::valueno& slot);
/** Value stream iterator.
*
* @deprecated Use @a get_value() in preference to *value_it, and other
* methods of ValuePostingSource in preference to calling methods of
* value_it.
*/
XAPIAN_DEPRECATED(Xapian::ValueIterator& value_it);
/** Flag indicating if we've started (true if we have).
*
* @deprecated Use @a get_started() in preference.
*/
XAPIAN_DEPRECATED(bool& started);
/** A lower bound on the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods to return fewer documents.
*
* @deprecated Use @a set_termfreq_min() in preference.
*/
XAPIAN_DEPRECATED(Xapian::doccount& termfreq_min);
/** An estimate of the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods.
*
* @deprecated Use @a set_termfreq_est() in preference.
*/
XAPIAN_DEPRECATED(Xapian::doccount& termfreq_est);
/** An upper bound on the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods.
*
* @deprecated Use @a set_termfreq_max() in preference.
*/
XAPIAN_DEPRECATED(Xapian::doccount& termfreq_max);
public:
/** Construct a ValuePostingSource.
*
* @param slot_ The value slot to read values from.
*/
explicit ValuePostingSource(Xapian::valueno slot_);
// The following methods implement or override the virtual methods of
// PostingSource with the same names; see the documentation there for the
// contract each one must satisfy. They are only declared in this header.
/// Implements PostingSource::get_termfreq_min().
Xapian::doccount get_termfreq_min() const;
/// Implements PostingSource::get_termfreq_est().
Xapian::doccount get_termfreq_est() const;
/// Implements PostingSource::get_termfreq_max().
Xapian::doccount get_termfreq_max() const;
/// Implements PostingSource::next().
void next(double min_wt);
/// Overrides PostingSource::skip_to(); @a min_docid is the docid to
/// advance to.
void skip_to(Xapian::docid min_docid, double min_wt);
/// Overrides PostingSource::check(); @a min_docid is the docid to check.
bool check(Xapian::docid min_docid, double min_wt);
/// Implements PostingSource::at_end().
bool at_end() const;
/// Implements PostingSource::get_docid().
Xapian::docid get_docid() const;
/// Implements PostingSource::init(); @a db_ is the database to read
/// values from.
void init(const Database & db_);
/** The database we're reading values from.
*
* Returns a copy of the Xapian::Database object held by this source.
*
* Added in 1.2.23 and 1.3.5.
*/
Xapian::Database get_database() const { return real_db; }
/** The slot we're reading values from.
*
* Added in 1.2.23 and 1.3.5.
*/
Xapian::valueno get_slot() const { return real_slot; }
/** Read current value.
*
* This dereferences the underlying value iterator and returns the result
* as a string.
*
* NOTE(review): presumably this must only be called when there is a
* current document (see PostingSource::get_weight()) - confirm.
*
* Added in 1.2.23 and 1.3.5.
*/
std::string get_value() const { return *real_value_it; }
/** End the iteration.
*
* Calls to at_end() will return true after calling this method.
*
* This moves the value iterator to the end of the value stream for the
* slot, and sets the "started" flag.
*
* Added in 1.2.23 and 1.3.5.
*/
void done() {
real_value_it = real_db.valuestream_end(real_slot);
real_started = true;
}
/** Flag indicating if we've started (true if we have).
*
* Added in 1.2.23 and 1.3.5.
*/
bool get_started() const { return real_started; }
/** Set a lower bound on the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods to return fewer documents.
*
* Added in 1.2.23 and 1.3.5.
*
* @param termfreq_min_ The new lower bound.
*/
void set_termfreq_min(Xapian::doccount termfreq_min_) {
real_termfreq_min = termfreq_min_;
}
/** Set an estimate of the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods.
*
* Added in 1.2.23 and 1.3.5.
*
* @param termfreq_est_ The new estimate.
*/
void set_termfreq_est(Xapian::doccount termfreq_est_) {
real_termfreq_est = termfreq_est_;
}
/** Set an upper bound on the term frequency.
*
* Subclasses should set this if they are overriding the next(), skip_to()
* or check() methods.
*
* Added in 1.2.23 and 1.3.5.
*
* @param termfreq_max_ The new upper bound.
*/
void set_termfreq_max(Xapian::doccount termfreq_max_) {
real_termfreq_max = termfreq_max_;
}
};
/** A posting source which reads weights from a value slot.
*
* This returns entries for all documents in the given database which have a
* non-empty value in the specified slot. It returns a weight calculated by
* applying sortable_unserialise to the value stored in the slot (so the
* values stored should probably have been calculated by applying
* sortable_serialise to a floating point number at index time).
*
* The upper bound on the weight returned is set using the upper bound on the
* values in the specified slot, or DBL_MAX if value bounds aren't supported
* by the current backend.
*
* For efficiency, this posting source doesn't check that the stored values
* are valid in any way, so it will never raise an exception due to invalid
* stored values. In particular, it doesn't ensure that the unserialised
* values are positive, which is a requirement for weights. The behaviour if
* the slot contains values which unserialise to negative values is undefined.
*/
class XAPIAN_VISIBILITY_DEFAULT ValueWeightPostingSource
: public ValuePostingSource {
public:
/** Construct a ValueWeightPostingSource.
*
* @param slot_ The value slot to read values from.
*/
explicit ValueWeightPostingSource(Xapian::valueno slot_);
// The following methods override the virtual methods of PostingSource
// with the same names; see the documentation there for the contract each
// one must satisfy. They are only declared in this header.
/// Overrides PostingSource::get_weight().
double get_weight() const;
/// Overrides PostingSource::clone(), with a covariant return type.
ValueWeightPostingSource * clone() const;
/// Overrides PostingSource::name().
std::string name() const;
/// Overrides PostingSource::serialise().
std::string serialise() const;
/// Overrides PostingSource::unserialise(), with a covariant return type.
ValueWeightPostingSource * unserialise(const std::string &serialised) const;
/// Overrides ValuePostingSource::init().
void init(const Database & db_);
/// Overrides PostingSource::get_description().
std::string get_description() const;
};
/** Read weights from a value which is known to decrease as docid increases.
*
* This posting source can be used, like ValueWeightPostingSource, to add a
* weight contribution to a query based on the values stored in a slot. The
* values in the slot must be serialised as by @a sortable_serialise().
*
* However, this posting source is additionally given a range of document IDs,
* within which the weight is known to be decreasing. ie, for all documents
* with ids A and B within this range (including the endpoints), where A is
* less than B, the weight of A is greater than or equal to the weight of B.
* This can allow the posting source to skip to the end of the range quickly
* if insufficient weight is left within the range (see skip_if_in_range()).
*
* By default, the range is assumed to cover all document IDs.
*
* The ordering property can be arranged at index time, or by sorting an
* indexed database to produce a new, sorted, database.
*/
class XAPIAN_VISIBILITY_DEFAULT DecreasingValueWeightPostingSource
: public Xapian::ValueWeightPostingSource {
protected:
/** Start of range of docids for which weights are known to be decreasing.
*
* 0 => first docid.
*/
Xapian::docid range_start;
/** End of range of docids for which weights are known to be decreasing.
*
* 0 => last docid.
*/
Xapian::docid range_end;
/// Weight at current position.
double curr_weight;
/// Flag, set to true if there are docs after the end of the range.
bool items_at_end;
/** Skip the iterator forward if in the decreasing range, and weight is low.
*
* @param min_wt The minimum weight contribution that is needed.
*/
void skip_if_in_range(double min_wt);
public:
/** Construct a DecreasingValueWeightPostingSource.
*
* @param slot_ The value slot to read values from.
* @param range_start_ Start of range of docids for which weights are
* known to be decreasing (default: first docid)
* @param range_end_ End of range of docids for which weights are
* known to be decreasing (default: last docid)
*/
DecreasingValueWeightPostingSource(Xapian::valueno slot_,
Xapian::docid range_start_ = 0,
Xapian::docid range_end_ = 0);
// The following methods override the virtual methods of the base classes
// with the same names; see PostingSource for the contract each one must
// satisfy. They are only declared in this header.
/// Overrides ValueWeightPostingSource::get_weight().
double get_weight() const;
/// Overrides ValueWeightPostingSource::clone(), with a covariant return
/// type.
DecreasingValueWeightPostingSource * clone() const;
/// Overrides ValueWeightPostingSource::name().
std::string name() const;
/// Overrides ValueWeightPostingSource::serialise().
std::string serialise() const;
/// Overrides ValueWeightPostingSource::unserialise(), with a covariant
/// return type.
DecreasingValueWeightPostingSource * unserialise(const std::string &serialised) const;
/// Overrides ValueWeightPostingSource::init().
void init(const Xapian::Database & db_);
/// Overrides ValuePostingSource::next().
void next(double min_wt);
/// Overrides ValuePostingSource::skip_to().
void skip_to(Xapian::docid min_docid, double min_wt);
/// Overrides ValuePostingSource::check().
bool check(Xapian::docid min_docid, double min_wt);
/// Overrides ValueWeightPostingSource::get_description().
std::string get_description() const;
};
/** A posting source which looks up weights in a map using values as the key.
*
* This will return entries for all documents in the given database which
* have a value in the slot specified. The values will be mapped to the
* corresponding weight in the weight map. If there is no mapping for a
* particular value, the default weight will be returned (which itself
* defaults to 0.0).
*/
class XAPIAN_VISIBILITY_DEFAULT ValueMapPostingSource
: public ValuePostingSource {
/// The default weight
double default_weight;
/// The maximum weight in weight_map.
double max_weight_in_map;
/// The value -> weight map
std::map<std::string, double> weight_map;
public:
/** Construct a ValueMapPostingSource.
*
* @param slot_ The value slot to read values from.
*/
explicit ValueMapPostingSource(Xapian::valueno slot_);
/** Add a mapping.
*
* @param key The key looked up from the value slot.
* @param wt The weight to give this key.
*/
void add_mapping(const std::string &key, double wt);
/** Clear all mappings. */
void clear_mappings();
/** Set a default weight for document values not in the map.
*
* @param wt The weight to set as the default.
*/
void set_default_weight(double wt);
// The following methods override the virtual methods of PostingSource
// with the same names; see the documentation there for the contract each
// one must satisfy. They are only declared in this header.
/// Overrides PostingSource::get_weight().
double get_weight() const;
/// Overrides PostingSource::clone(), with a covariant return type.
ValueMapPostingSource * clone() const;
/// Overrides PostingSource::name().
std::string name() const;
/// Overrides PostingSource::serialise().
std::string serialise() const;
/// Overrides PostingSource::unserialise(), with a covariant return type.
ValueMapPostingSource * unserialise(const std::string &serialised) const;
/// Overrides ValuePostingSource::init().
void init(const Database & db_);
/// Overrides PostingSource::get_description().
std::string get_description() const;
};
/** A posting source which returns a fixed weight for all documents.
*
* This returns entries for all documents in the given database, with a fixed
* weight (specified by a parameter to the constructor).
*/
class XAPIAN_VISIBILITY_DEFAULT FixedWeightPostingSource : public PostingSource {
/// The database we're reading documents from.
Xapian::Database db;
/// Number of documents in the posting source.
Xapian::doccount termfreq;
/// Iterator over all documents.
Xapian::PostingIterator it;
/// Flag indicating if we've started (true if we have).
bool started;
/// The docid last passed to check() (0 if check() wasn't the last move).
Xapian::docid check_docid;
public:
/** Construct a FixedWeightPostingSource.
*
* @param wt The fixed weight to return.
*/
explicit FixedWeightPostingSource(double wt);
// The following methods implement or override the virtual methods of
// PostingSource with the same names; see the documentation there for the
// contract each one must satisfy. They are only declared in this header.
/// Implements PostingSource::get_termfreq_min().
Xapian::doccount get_termfreq_min() const;
/// Implements PostingSource::get_termfreq_est().
Xapian::doccount get_termfreq_est() const;
/// Implements PostingSource::get_termfreq_max().
Xapian::doccount get_termfreq_max() const;
/// Overrides PostingSource::get_weight().
double get_weight() const;
/// Implements PostingSource::next().
void next(double min_wt);
/// Overrides PostingSource::skip_to(); @a min_docid is the docid to
/// advance to.
void skip_to(Xapian::docid min_docid, double min_wt);
/// Overrides PostingSource::check(); @a min_docid is the docid to check.
bool check(Xapian::docid min_docid, double min_wt);
/// Implements PostingSource::at_end().
bool at_end() const;
/// Implements PostingSource::get_docid().
Xapian::docid get_docid() const;
/// Overrides PostingSource::clone(), with a covariant return type.
FixedWeightPostingSource * clone() const;
/// Overrides PostingSource::name().
std::string name() const;
/// Overrides PostingSource::serialise().
std::string serialise() const;
/// Overrides PostingSource::unserialise(), with a covariant return type.
FixedWeightPostingSource * unserialise(const std::string &serialised) const;
/// Implements PostingSource::init(); @a db_ is the database to iterate
/// through.
void init(const Database & db_);
/// Overrides PostingSource::get_description().
std::string get_description() const;
};
}
#endif // XAPIAN_INCLUDED_POSTINGSOURCE_H