/* Mirror of https://github.com/xiaoyifang/goldendict-ng.git
 * (synced 2024-11-24 12:44:07 +00:00; 1538 lines, 52 KiB, C).
 */
/** @file
 * @brief Weighting scheme API.
 */
/* Copyright (C) 2004,2007,2008,2009,2010,2011,2012,2015,2016,2019 Olly Betts
 * Copyright (C) 2009 Lemur Consulting Ltd
 * Copyright (C) 2013,2014 Aarsh Shah
 * Copyright (C) 2016 Vivek Pal
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */
|
||
|
|
||
|
#ifndef XAPIAN_INCLUDED_WEIGHT_H
|
||
|
#define XAPIAN_INCLUDED_WEIGHT_H
|
||
|
|
||
|
#include <string>
|
||
|
|
||
|
#include <xapian/types.h>
|
||
|
#include <xapian/visibility.h>
|
||
|
|
||
|
namespace Xapian {
|
||
|
|
||
|
/** Abstract base class for weighting schemes. */
|
||
|
class XAPIAN_VISIBILITY_DEFAULT Weight {
|
||
|
protected:
|
||
|
/// Stats which the weighting scheme can use (see @a need_stat()).
|
||
|
typedef enum {
|
||
|
/// Number of documents in the collection.
|
||
|
COLLECTION_SIZE = 1,
|
||
|
/// Number of documents in the RSet.
|
||
|
RSET_SIZE = 2,
|
||
|
/// Average length of documents in the collection.
|
||
|
AVERAGE_LENGTH = 4,
|
||
|
/// How many documents the current term is in.
|
||
|
TERMFREQ = 8,
|
||
|
/// How many documents in the RSet the current term is in.
|
||
|
RELTERMFREQ = 16,
|
||
|
/// Sum of wqf for terms in the query.
|
||
|
QUERY_LENGTH = 32,
|
||
|
/// Within-query-frequency of the current term.
|
||
|
WQF = 64,
|
||
|
/// Within-document-frequency of the current term in the current document.
|
||
|
WDF = 128,
|
||
|
/// Length of the current document (sum wdf).
|
||
|
DOC_LENGTH = 256,
|
||
|
/// Lower bound on (non-zero) document lengths.
|
||
|
DOC_LENGTH_MIN = 512,
|
||
|
/// Upper bound on document lengths.
|
||
|
DOC_LENGTH_MAX = 1024,
|
||
|
/// Upper bound on wdf.
|
||
|
WDF_MAX = 2048,
|
||
|
/// Sum of wdf over the whole collection for the current term.
|
||
|
COLLECTION_FREQ = 4096,
|
||
|
/// Number of unique terms in the current document.
|
||
|
UNIQUE_TERMS = 8192,
|
||
|
/** Sum of lengths of all documents in the collection.
|
||
|
*
|
||
|
* This gives the total number of term occurrences.
|
||
|
*/
|
||
|
TOTAL_LENGTH = COLLECTION_SIZE | AVERAGE_LENGTH
|
||
|
} stat_flags;
|
||
|
|
||
|
/** Tell Xapian that your subclass will want a particular statistic.
|
||
|
*
|
||
|
* Some of the statistics can be costly to fetch or calculate, so
|
||
|
* Xapian needs to know which are actually going to be used. You
|
||
|
* should call need_stat() from your constructor for each such
|
||
|
* statistic.
|
||
|
*
|
||
|
* @param flag The stat_flags value for a required statistic.
|
||
|
*/
|
||
|
void need_stat(stat_flags flag) {
|
||
|
stats_needed = stat_flags(stats_needed | flag);
|
||
|
}
|
||
|
|
||
|
/** Allow the subclass to perform any initialisation it needs to.
|
||
|
*
|
||
|
* @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
|
||
|
* If the Weight object is for the term-independent
|
||
|
* weight supplied by get_sumextra()/get_maxextra(),
|
||
|
* then init(0.0) is called (starting from Xapian
|
||
|
* 1.2.11 and 1.3.1 - earlier versions failed to
|
||
|
* call init() for such Weight objects).
|
||
|
*/
|
||
|
virtual void init(double factor) = 0;
|
||
|
|
||
|
private:
|
||
|
/// Don't allow assignment.
|
||
|
void operator=(const Weight &);
|
||
|
|
||
|
/// A bitmask of the statistics this weighting scheme needs.
|
||
|
stat_flags stats_needed;
|
||
|
|
||
|
/// The number of documents in the collection.
|
||
|
Xapian::doccount collection_size_;
|
||
|
|
||
|
/// The number of documents marked as relevant.
|
||
|
Xapian::doccount rset_size_;
|
||
|
|
||
|
/// The average length of a document in the collection.
|
||
|
Xapian::doclength average_length_;
|
||
|
|
||
|
/// The number of documents which this term indexes.
|
||
|
Xapian::doccount termfreq_;
|
||
|
|
||
|
// The collection frequency of the term.
|
||
|
Xapian::termcount collectionfreq_;
|
||
|
|
||
|
/// The number of relevant documents which this term indexes.
|
||
|
Xapian::doccount reltermfreq_;
|
||
|
|
||
|
/// The length of the query.
|
||
|
Xapian::termcount query_length_;
|
||
|
|
||
|
/// The within-query-frequency of this term.
|
||
|
Xapian::termcount wqf_;
|
||
|
|
||
|
/// A lower bound on the minimum length of any document in the database.
|
||
|
Xapian::termcount doclength_lower_bound_;
|
||
|
|
||
|
/// An upper bound on the maximum length of any document in the database.
|
||
|
Xapian::termcount doclength_upper_bound_;
|
||
|
|
||
|
/// An upper bound on the wdf of this term.
|
||
|
Xapian::termcount wdf_upper_bound_;
|
||
|
|
||
|
public:
|
||
|
|
||
|
/// Default constructor, needed by subclass constructors.
|
||
|
Weight() : stats_needed() { }
|
||
|
|
||
|
/** Type of smoothing to use with the Language Model Weighting scheme.
|
||
|
*
|
||
|
* Default is TWO_STAGE_SMOOTHING.
|
||
|
*/
|
||
|
typedef enum {
|
||
|
TWO_STAGE_SMOOTHING = 1,
|
||
|
DIRICHLET_SMOOTHING = 2,
|
||
|
ABSOLUTE_DISCOUNT_SMOOTHING = 3,
|
||
|
JELINEK_MERCER_SMOOTHING = 4,
|
||
|
DIRICHLET_PLUS_SMOOTHING = 5
|
||
|
} type_smoothing;
|
||
|
|
||
|
class Internal;
|
||
|
|
||
|
/** Virtual destructor, because we have virtual methods. */
|
||
|
virtual ~Weight();
|
||
|
|
||
|
/** Clone this object.
|
||
|
*
|
||
|
* This method allocates and returns a copy of the object it is called on.
|
||
|
*
|
||
|
* If your subclass is called FooWeight and has parameters a and b, then
|
||
|
* you would implement FooWeight::clone() like so:
|
||
|
*
|
||
|
* FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
|
||
|
*
|
||
|
* Note that the returned object will be deallocated by Xapian after use
|
||
|
* with "delete". If you want to handle the deletion in a special way
|
||
|
* (for example when wrapping the Xapian API for use from another
|
||
|
* language) then you can define a static <code>operator delete</code>
|
||
|
* method in your subclass as shown here:
|
||
|
* https://trac.xapian.org/ticket/554#comment:1
|
||
|
*/
|
||
|
virtual Weight * clone() const = 0;
|
||
|
|
||
|
/** Return the name of this weighting scheme.
|
||
|
*
|
||
|
* This name is used by the remote backend. It is passed along with the
|
||
|
* serialised parameters to the remote server so that it knows which class
|
||
|
* to create.
|
||
|
*
|
||
|
* Return the full namespace-qualified name of your class here - if
|
||
|
* your class is called FooWeight, return "FooWeight" from this method
|
||
|
* (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
|
||
|
*
|
||
|
* If you don't want to support the remote backend, you can use the
|
||
|
* default implementation which simply returns an empty string.
|
||
|
*/
|
||
|
virtual std::string name() const;
|
||
|
|
||
|
/** Return this object's parameters serialised as a single string.
|
||
|
*
|
||
|
* If you don't want to support the remote backend, you can use the
|
||
|
* default implementation which simply throws Xapian::UnimplementedError.
|
||
|
*/
|
||
|
virtual std::string serialise() const;
|
||
|
|
||
|
/** Unserialise parameters.
|
||
|
*
|
||
|
* This method unserialises parameters serialised by the @a serialise()
|
||
|
* method and allocates and returns a new object initialised with them.
|
||
|
*
|
||
|
* If you don't want to support the remote backend, you can use the
|
||
|
* default implementation which simply throws Xapian::UnimplementedError.
|
||
|
*
|
||
|
* Note that the returned object will be deallocated by Xapian after use
|
||
|
* with "delete". If you want to handle the deletion in a special way
|
||
|
* (for example when wrapping the Xapian API for use from another
|
||
|
* language) then you can define a static <code>operator delete</code>
|
||
|
* method in your subclass as shown here:
|
||
|
* https://trac.xapian.org/ticket/554#comment:1
|
||
|
*
|
||
|
* @param serialised A string containing the serialised parameters.
|
||
|
*/
|
||
|
virtual Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
/** Calculate the weight contribution for this object's term to a document.
|
||
|
*
|
||
|
* The parameters give information about the document which may be used
|
||
|
* in the calculations:
|
||
|
*
|
||
|
* @param wdf The within document frequency of the term in the document.
|
||
|
* @param doclen The document's length (unnormalised).
|
||
|
* @param uniqterms Number of unique terms in the document (used
|
||
|
* for absolute smoothing).
|
||
|
*/
|
||
|
virtual double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const = 0;
|
||
|
|
||
|
/** Return an upper bound on what get_sumpart() can return for any document.
|
||
|
*
|
||
|
* This information is used by the matcher to perform various
|
||
|
* optimisations, so strive to make the bound as tight as possible.
|
||
|
*/
|
||
|
virtual double get_maxpart() const = 0;
|
||
|
|
||
|
/** Calculate the term-independent weight component for a document.
|
||
|
*
|
||
|
* The parameter gives information about the document which may be used
|
||
|
* in the calculations:
|
||
|
*
|
||
|
* @param doclen The document's length (unnormalised).
|
||
|
* @param uniqterms The number of unique terms in the document.
|
||
|
*/
|
||
|
virtual double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const = 0;
|
||
|
|
||
|
/** Return an upper bound on what get_sumextra() can return for any
|
||
|
* document.
|
||
|
*
|
||
|
* This information is used by the matcher to perform various
|
||
|
* optimisations, so strive to make the bound as tight as possible.
|
||
|
*/
|
||
|
virtual double get_maxextra() const = 0;
|
||
|
|
||
|
/** @private @internal Initialise this object to calculate weights for term
|
||
|
* @a term.
|
||
|
*
|
||
|
* Old version of method, as used by 1.4.18 and earlier. This
|
||
|
* should only be referenced from inside the library and 1.4.19 and
|
||
|
* later will call the new version instead. We continue to provide it
|
||
|
* mainly to avoid triggering ABI checking tools.
|
||
|
*
|
||
|
* @param stats Source of statistics.
|
||
|
* @param query_len_ Query length.
|
||
|
* @param term The term for the new object.
|
||
|
* @param wqf_ The within-query-frequency of @a term.
|
||
|
* @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
|
||
|
*/
|
||
|
void init_(const Internal & stats, Xapian::termcount query_len_,
|
||
|
const std::string & term, Xapian::termcount wqf_,
|
||
|
double factor);
|
||
|
|
||
|
/** @private @internal Initialise this object to calculate weights for term
|
||
|
* @a term.
|
||
|
*
|
||
|
* @param stats Source of statistics.
|
||
|
* @param query_len_ Query length.
|
||
|
* @param term The term for the new object.
|
||
|
* @param wqf_ The within-query-frequency of @a term.
|
||
|
* @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
|
||
|
* @param postlist Pointer to a LeafPostList for the term (cast to void*
|
||
|
* to avoid needing to forward declare class
|
||
|
* LeafPostList in public API headers) which can be used
|
||
|
* to get wdf upper bound
|
||
|
*/
|
||
|
void init_(const Internal & stats, Xapian::termcount query_len_,
|
||
|
const std::string & term, Xapian::termcount wqf_,
|
||
|
double factor, void* postlist);
|
||
|
|
||
|
/** @private @internal Initialise this object to calculate weights for a
|
||
|
* synonym.
|
||
|
*
|
||
|
* @param stats Source of statistics.
|
||
|
* @param query_len_ Query length.
|
||
|
* @param factor Any scaling factor (e.g. from OP_SCALE_WEIGHT).
|
||
|
* @param termfreq The termfreq to use.
|
||
|
* @param reltermfreq The reltermfreq to use.
|
||
|
* @param collection_freq The collection frequency to use.
|
||
|
*/
|
||
|
void init_(const Internal & stats, Xapian::termcount query_len_,
|
||
|
double factor, Xapian::doccount termfreq,
|
||
|
Xapian::doccount reltermfreq, Xapian::termcount collection_freq);
|
||
|
|
||
|
/** @private @internal Initialise this object to calculate the extra weight
|
||
|
* component.
|
||
|
*
|
||
|
* @param stats Source of statistics.
|
||
|
* @param query_len_ Query length.
|
||
|
*/
|
||
|
void init_(const Internal & stats, Xapian::termcount query_len_);
|
||
|
|
||
|
/** @private @internal Return true if the document length is needed.
|
||
|
*
|
||
|
* If this method returns true, then the document length will be fetched
|
||
|
* and passed to @a get_sumpart(). Otherwise 0 may be passed for the
|
||
|
* document length.
|
||
|
*/
|
||
|
bool get_sumpart_needs_doclength_() const {
|
||
|
return stats_needed & DOC_LENGTH;
|
||
|
}
|
||
|
|
||
|
/** @private @internal Return true if the WDF is needed.
|
||
|
*
|
||
|
* If this method returns true, then the WDF will be fetched and passed to
|
||
|
* @a get_sumpart(). Otherwise 0 may be passed for the wdf.
|
||
|
*/
|
||
|
bool get_sumpart_needs_wdf_() const {
|
||
|
return stats_needed & WDF;
|
||
|
}
|
||
|
|
||
|
/** @private @internal Return true if the number of unique terms is needed.
|
||
|
*
|
||
|
* If this method returns true, then the number of unique terms will be
|
||
|
* fetched and passed to @a get_sumpart(). Otherwise 0 may be passed for
|
||
|
* the number of unique terms.
|
||
|
*/
|
||
|
bool get_sumpart_needs_uniqueterms_() const {
|
||
|
return stats_needed & UNIQUE_TERMS;
|
||
|
}
|
||
|
|
||
|
/// @private @internal Test if this is a BoolWeight object.
|
||
|
bool is_bool_weight_() const {
|
||
|
// Checking the name isn't ideal, but (get_maxpart() == 0.0) isn't
|
||
|
// required to work without init() having been called. We can at
|
||
|
// least avoid the virtual method call in most non-BoolWeight cases
|
||
|
// as most other classes will need at least some stats.
|
||
|
return stats_needed == 0 && name() == "Xapian::BoolWeight";
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
/** Don't allow copying.
|
||
|
*
|
||
|
* This would ideally be private, but that causes a compilation error
|
||
|
* with GCC 4.1 (which appears to be a bug).
|
||
|
*/
|
||
|
Weight(const Weight &);
|
||
|
|
||
|
/// The number of documents in the collection.
|
||
|
Xapian::doccount get_collection_size() const { return collection_size_; }
|
||
|
|
||
|
/// The number of documents marked as relevant.
|
||
|
Xapian::doccount get_rset_size() const { return rset_size_; }
|
||
|
|
||
|
/// The average length of a document in the collection.
|
||
|
Xapian::doclength get_average_length() const { return average_length_; }
|
||
|
|
||
|
/// The number of documents which this term indexes.
|
||
|
Xapian::doccount get_termfreq() const { return termfreq_; }
|
||
|
|
||
|
/// The number of relevant documents which this term indexes.
|
||
|
Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
|
||
|
|
||
|
/// The collection frequency of the term.
|
||
|
Xapian::termcount get_collection_freq() const { return collectionfreq_; }
|
||
|
|
||
|
/// The length of the query.
|
||
|
Xapian::termcount get_query_length() const { return query_length_; }
|
||
|
|
||
|
/// The within-query-frequency of this term.
|
||
|
Xapian::termcount get_wqf() const { return wqf_; }
|
||
|
|
||
|
/** An upper bound on the maximum length of any document in the database.
|
||
|
*
|
||
|
* This should only be used by get_maxpart() and get_maxextra().
|
||
|
*/
|
||
|
Xapian::termcount get_doclength_upper_bound() const {
|
||
|
return doclength_upper_bound_;
|
||
|
}
|
||
|
|
||
|
/** A lower bound on the minimum length of any document in the database.
|
||
|
*
|
||
|
* This bound does not include any zero-length documents.
|
||
|
*
|
||
|
* This should only be used by get_maxpart() and get_maxextra().
|
||
|
*/
|
||
|
Xapian::termcount get_doclength_lower_bound() const {
|
||
|
return doclength_lower_bound_;
|
||
|
}
|
||
|
|
||
|
/** An upper bound on the wdf of this term.
|
||
|
*
|
||
|
* This should only be used by get_maxpart() and get_maxextra().
|
||
|
*/
|
||
|
Xapian::termcount get_wdf_upper_bound() const {
|
||
|
return wdf_upper_bound_;
|
||
|
}
|
||
|
|
||
|
/// Total length of all documents in the collection.
|
||
|
Xapian::totallength get_total_length() const {
|
||
|
return Xapian::totallength(average_length_ * collection_size_ + 0.5);
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/** Class implementing a "boolean" weighting scheme.
|
||
|
*
|
||
|
* This weighting scheme gives all documents zero weight.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
|
||
|
BoolWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a BoolWeight. */
|
||
|
BoolWeight() { }
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
BoolWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/// Xapian::Weight subclass implementing the tf-idf weighting scheme.
|
||
|
class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
|
||
|
/* Three character string indicating the normalizations for tf(wdf), idf and
|
||
|
tfidf weight. */
|
||
|
std::string normalizations;
|
||
|
|
||
|
/// The factor to multiply with the weight.
|
||
|
double factor;
|
||
|
|
||
|
TfIdfWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
/* When additional normalizations are implemented in the future, the additional statistics for them
|
||
|
should be accessed by these functions. */
|
||
|
double get_wdfn(Xapian::termcount wdf, char c) const;
|
||
|
double get_idfn(Xapian::doccount termfreq, char c) const;
|
||
|
double get_wtn(double wt, char c) const;
|
||
|
|
||
|
public:
|
||
|
/** Construct a TfIdfWeight
|
||
|
*
|
||
|
* @param normalizations A three character string indicating the
|
||
|
* normalizations to be used for the tf(wdf), idf
|
||
|
* and document weight. (default: "ntn")
|
||
|
*
|
||
|
* The @a normalizations string works like so:
|
||
|
*
|
||
|
* @li The first character specifies the normalization for the wdf. The
|
||
|
* following normalizations are currently supported:
|
||
|
*
|
||
|
* @li 'n': None. wdfn=wdf
|
||
|
* @li 'b': Boolean wdfn=1 if term in document else wdfn=0
|
||
|
* @li 's': Square wdfn=wdf*wdf
|
||
|
* @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
|
||
|
* @li 'L': Log average wdfn=(1+log(wdf))/(1+log(doclen/unique_terms))
|
||
|
*
|
||
|
* The Max-wdf and Augmented Max wdf normalizations haven't yet been
|
||
|
* implemented.
|
||
|
*
|
||
|
* @li The second character indicates the normalization for the idf. The
|
||
|
* following normalizations are currently supported:
|
||
|
*
|
||
|
* @li 'n': None idfn=1
|
||
|
* @li 't': TfIdf idfn=log(N/Termfreq) where N is the number of
|
||
|
* documents in collection and Termfreq is the number of documents
|
||
|
* which are indexed by the term t.
|
||
|
* @li 'p': Prob idfn=log((N-Termfreq)/Termfreq)
|
||
|
* @li 'f': Freq idfn=1/Termfreq
|
||
|
* @li 's': Squared idfn=(log(N/Termfreq))²
|
||
|
*
|
||
|
* @li The third and the final character indicates the normalization for
|
||
|
* the document weight. The following normalizations are currently
|
||
|
* supported:
|
||
|
*
|
||
|
* @li 'n': None wtn=tfn*idfn
|
||
|
*
|
||
|
* Implementing support for more normalizations of each type would require
|
||
|
* extending the backend to track more statistics.
|
||
|
*/
|
||
|
explicit TfIdfWeight(const std::string &normalizations);
|
||
|
|
||
|
/** Construct a TfIdfWeight using the default normalizations ("ntn"). */
|
||
|
TfIdfWeight()
|
||
|
: normalizations("ntn")
|
||
|
{
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
TfIdfWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
|
||
|
/// Xapian::Weight subclass implementing the BM25 probabilistic formula.
|
||
|
class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
|
||
|
/// Factor to multiply the document length by.
|
||
|
mutable Xapian::doclength len_factor;
|
||
|
|
||
|
/// Factor combining all the document independent factors.
|
||
|
mutable double termweight;
|
||
|
|
||
|
/// The BM25 parameters.
|
||
|
double param_k1, param_k2, param_k3, param_b;
|
||
|
|
||
|
/// The minimum normalised document length value.
|
||
|
Xapian::doclength param_min_normlen;
|
||
|
|
||
|
BM25Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a BM25Weight.
|
||
|
*
|
||
|
* @param k1 A non-negative parameter controlling how influential
|
||
|
* within-document-frequency (wdf) is. k1=0 means that
|
||
|
* wdf doesn't affect the weights. The larger k1 is, the more
|
||
|
* wdf influences the weights. (default 1)
|
||
|
*
|
||
|
* @param k2 A non-negative parameter which controls the strength of a
|
||
|
* correction factor which depends upon query length and
|
||
|
* normalised document length. k2=0 disable this factor; larger
|
||
|
* k2 makes it stronger. (default 0)
|
||
|
*
|
||
|
* @param k3 A non-negative parameter controlling how influential
|
||
|
* within-query-frequency (wqf) is. k3=0 means that wqf
|
||
|
* doesn't affect the weights. The larger k3 is, the more
|
||
|
* wqf influences the weights. (default 1)
|
||
|
*
|
||
|
* @param b A parameter between 0 and 1, controlling how strong the
|
||
|
* document length normalisation of wdf is. 0 means no
|
||
|
* normalisation; 1 means full normalisation. (default 0.5)
|
||
|
*
|
||
|
* @param min_normlen A parameter specifying a minimum value for
|
||
|
* normalised document length. Normalised document length
|
||
|
* values less than this will be clamped to this value, helping
|
||
|
* to prevent very short documents getting large weights.
|
||
|
* (default 0.5)
|
||
|
*/
|
||
|
BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
|
||
|
: param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
|
||
|
param_min_normlen(min_normlen)
|
||
|
{
|
||
|
if (param_k1 < 0) param_k1 = 0;
|
||
|
if (param_k2 < 0) param_k2 = 0;
|
||
|
if (param_k3 < 0) param_k3 = 0;
|
||
|
if (param_b < 0) {
|
||
|
param_b = 0;
|
||
|
} else if (param_b > 1) {
|
||
|
param_b = 1;
|
||
|
}
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
}
|
||
|
if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
|
||
|
if (param_k2 != 0) need_stat(QUERY_LENGTH);
|
||
|
if (param_k3 != 0) need_stat(WQF);
|
||
|
}
|
||
|
|
||
|
BM25Weight()
|
||
|
: param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
|
||
|
param_min_normlen(0.5)
|
||
|
{
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(WQF);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
BM25Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/// Xapian::Weight subclass implementing the BM25+ probabilistic formula.
|
||
|
class XAPIAN_VISIBILITY_DEFAULT BM25PlusWeight : public Weight {
|
||
|
/// Factor to multiply the document length by.
|
||
|
mutable Xapian::doclength len_factor;
|
||
|
|
||
|
/// Factor combining all the document independent factors.
|
||
|
mutable double termweight;
|
||
|
|
||
|
/// The BM25+ parameters.
|
||
|
double param_k1, param_k2, param_k3, param_b;
|
||
|
|
||
|
/// The minimum normalised document length value.
|
||
|
Xapian::doclength param_min_normlen;
|
||
|
|
||
|
/// Additional parameter delta in the BM25+ formula.
|
||
|
double param_delta;
|
||
|
|
||
|
BM25PlusWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a BM25PlusWeight.
|
||
|
*
|
||
|
* @param k1 A non-negative parameter controlling how influential
|
||
|
* within-document-frequency (wdf) is. k1=0 means that
|
||
|
* wdf doesn't affect the weights. The larger k1 is, the more
|
||
|
* wdf influences the weights. (default 1)
|
||
|
*
|
||
|
* @param k2 A non-negative parameter which controls the strength of a
|
||
|
* correction factor which depends upon query length and
|
||
|
* normalised document length. k2=0 disable this factor; larger
|
||
|
* k2 makes it stronger. The paper which describes BM25+
|
||
|
* ignores BM25's document-independent component (so implicitly
|
||
|
* k2=0), but we support non-zero k2 too. (default 0)
|
||
|
*
|
||
|
* @param k3 A non-negative parameter controlling how influential
|
||
|
* within-query-frequency (wqf) is. k3=0 means that wqf
|
||
|
* doesn't affect the weights. The larger k3 is, the more
|
||
|
* wqf influences the weights. (default 1)
|
||
|
*
|
||
|
* @param b A parameter between 0 and 1, controlling how strong the
|
||
|
* document length normalisation of wdf is. 0 means no
|
||
|
* normalisation; 1 means full normalisation. (default 0.5)
|
||
|
*
|
||
|
* @param min_normlen A parameter specifying a minimum value for
|
||
|
* normalised document length. Normalised document length
|
||
|
* values less than this will be clamped to this value, helping
|
||
|
* to prevent very short documents getting large weights.
|
||
|
* (default 0.5)
|
||
|
*
|
||
|
* @param delta A parameter for pseudo tf value to control the scale
|
||
|
* of the tf lower bound. Delta(δ) can be tuned for example
|
||
|
* from 0.0 to 1.5 but BM25+ can still work effectively
|
||
|
* across collections with a fixed δ = 1.0. (default 1.0)
|
||
|
*/
|
||
|
BM25PlusWeight(double k1, double k2, double k3, double b,
|
||
|
double min_normlen, double delta)
|
||
|
: param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
|
||
|
param_min_normlen(min_normlen), param_delta(delta)
|
||
|
{
|
||
|
if (param_k1 < 0) param_k1 = 0;
|
||
|
if (param_k2 < 0) param_k2 = 0;
|
||
|
if (param_k3 < 0) param_k3 = 0;
|
||
|
if (param_delta < 0) param_delta = 0;
|
||
|
if (param_b < 0) {
|
||
|
param_b = 0;
|
||
|
} else if (param_b > 1) {
|
||
|
param_b = 1;
|
||
|
}
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
}
|
||
|
if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
|
||
|
if (param_k2 != 0) need_stat(QUERY_LENGTH);
|
||
|
if (param_k3 != 0) need_stat(WQF);
|
||
|
if (param_delta != 0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(WQF);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
BM25PlusWeight()
|
||
|
: param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
|
||
|
param_min_normlen(0.5), param_delta(1)
|
||
|
{
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(WQF);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
BM25PlusWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** Xapian::Weight subclass implementing the traditional probabilistic formula.
|
||
|
*
|
||
|
* This class implements the "traditional" Probabilistic Weighting scheme, as
|
||
|
* described by the early papers on Probabilistic Retrieval. BM25 generally
|
||
|
* gives better results.
|
||
|
*
|
||
|
* TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
|
||
|
* the latter returns weights (k+1) times larger.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
|
||
|
/// Factor to multiply the document length by.
|
||
|
mutable Xapian::doclength len_factor;
|
||
|
|
||
|
/// Factor combining all the document independent factors.
|
||
|
mutable double termweight;
|
||
|
|
||
|
/// The parameter in the formula.
|
||
|
double param_k;
|
||
|
|
||
|
TradWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a TradWeight.
|
||
|
*
|
||
|
* @param k A non-negative parameter controlling how influential
|
||
|
* within-document-frequency (wdf) and document length are.
|
||
|
* k=0 means that wdf and document length don't affect the
|
||
|
* weights. The larger k is, the more they do. (default 1)
|
||
|
*/
|
||
|
explicit TradWeight(double k = 1.0) : param_k(k) {
|
||
|
if (param_k < 0) param_k = 0;
|
||
|
if (param_k != 0.0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
}
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
TradWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqueterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the InL2 weighting scheme.
|
||
|
*
|
||
|
* InL2 is a representative scheme of the Divergence from Randomness Framework
|
||
|
* by Gianni Amati.
|
||
|
*
|
||
|
* This weighting scheme is useful for tasks that require early precision.
|
||
|
*
|
||
|
* It uses the Inverse document frequency model (In), the Laplace method to
|
||
|
* find the aftereffect of sampling (L) and the second wdf normalization
|
||
|
* proposed by Amati to normalize the wdf in the document to the length of the
|
||
|
* document (H2).
|
||
|
*
|
||
|
* For more information about the DFR Framework and the InL2 scheme, please
|
||
|
* refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
|
||
|
* models of information retrieval based on measuring the divergence from
|
||
|
* randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
|
||
|
* pp. 357-389.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT InL2Weight : public Weight {
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/// The upper bound on the weight a term can give to a document.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// The constant values which are used on every call to get_sumpart().
|
||
|
double wqf_product_idf;
|
||
|
double c_product_avlen;
|
||
|
|
||
|
InL2Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct an InL2Weight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. The
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis.
|
||
|
*/
|
||
|
explicit InL2Weight(double c);
|
||
|
|
||
|
InL2Weight()
|
||
|
: param_c(1.0)
|
||
|
{
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
need_stat(TERMFREQ);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
InL2Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the IfB2 weighting scheme.
|
||
|
*
|
||
|
* IfB2 is a representative scheme of the Divergence from Randomness Framework
|
||
|
* by Gianni Amati.
|
||
|
*
|
||
|
* It uses the Inverse term frequency model (If), the Bernoulli method to find
|
||
|
* the aftereffect of sampling (B) and the second wdf normalization proposed
|
||
|
* by Amati to normalize the wdf in the document to the length of the document
|
||
|
* (H2).
|
||
|
*
|
||
|
* For more information about the DFR Framework and the IfB2 scheme, please
|
||
|
* refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
|
||
|
* models of information retrieval based on measuring the divergence from
|
||
|
* randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
|
||
|
* pp. 357-389.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT IfB2Weight : public Weight {
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// The constant values which are used for calculations in get_sumpart().
|
||
|
double wqf_product_idf;
|
||
|
double c_product_avlen;
|
||
|
double B_constant;
|
||
|
|
||
|
IfB2Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct an IfB2Weight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. The
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis titled
|
||
|
* Probabilistic Models for Information Retrieval based on
|
||
|
* Divergence from Randomness.
|
||
|
*/
|
||
|
explicit IfB2Weight(double c);
|
||
|
|
||
|
IfB2Weight() : param_c(1.0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
need_stat(TERMFREQ);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
IfB2Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the IneB2 weighting scheme.
|
||
|
*
|
||
|
* IneB2 is a representative scheme of the Divergence from Randomness
|
||
|
* Framework by Gianni Amati.
|
||
|
*
|
||
|
* It uses the Inverse expected document frequency model (Ine), the Bernoulli
|
||
|
* method to find the aftereffect of sampling (B) and the second wdf
|
||
|
* normalization proposed by Amati to normalize the wdf in the document to the
|
||
|
* length of the document (H2).
|
||
|
*
|
||
|
* For more information about the DFR Framework and the IneB2 scheme, please
|
||
|
* refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
|
||
|
* models of information retrieval based on measuring the divergence from
|
||
|
* randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
|
||
|
* pp. 357-389.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT IneB2Weight : public Weight {
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/// The upper bound of the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// Constant values used in get_sumpart().
|
||
|
double wqf_product_idf;
|
||
|
double c_product_avlen;
|
||
|
double B_constant;
|
||
|
|
||
|
IneB2Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct an IneB2Weight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. The
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis.
|
||
|
*/
|
||
|
explicit IneB2Weight(double c);
|
||
|
|
||
|
IneB2Weight() : param_c(1.0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(TERMFREQ);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
IneB2Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the BB2 weighting scheme.
|
||
|
*
|
||
|
* BB2 is a representative scheme of the Divergence from Randomness Framework
|
||
|
* by Gianni Amati.
|
||
|
*
|
||
|
* It uses the Bose-Einstein probabilistic distribution (B) along with
|
||
|
* Stirling's power approximation, the Bernoulli method to find the
|
||
|
* aftereffect of sampling (B) and the second wdf normalization proposed by
|
||
|
* Amati to normalize the wdf in the document to the length of the document
|
||
|
* (H2).
|
||
|
*
|
||
|
* For more information about the DFR Framework and the BB2 scheme, please
|
||
|
* refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
|
||
|
* models of information retrieval based on measuring the divergence from
|
||
|
* randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
|
||
|
* pp. 357-389.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// The constant values to be used in get_sumpart().
|
||
|
double c_product_avlen;
|
||
|
double B_constant;
|
||
|
double wt;
|
||
|
double stirling_constant_1;
|
||
|
double stirling_constant_2;
|
||
|
|
||
|
BB2Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a BB2Weight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. A
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis titled
|
||
|
* Probabilistic Models for Information Retrieval based on
|
||
|
* Divergence from Randomness.
|
||
|
*/
|
||
|
explicit BB2Weight(double c);
|
||
|
|
||
|
BB2Weight() : param_c(1.0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
need_stat(TERMFREQ);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
BB2Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the DLH weighting scheme, which is a representative
|
||
|
* scheme of the Divergence from Randomness Framework by Gianni Amati.
|
||
|
*
|
||
|
* This is a parameter free weighting scheme and it should be used with query
|
||
|
* expansion to obtain better results. It uses the HyperGeometric Probabilistic
|
||
|
* model and Laplace's normalization to calculate the risk gain.
|
||
|
*
|
||
|
* For more information about the DFR Framework and the DLH scheme, please
|
||
|
* refer to :
|
||
|
* a.) Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
|
||
|
* models of information retrieval based on measuring the divergence from
|
||
|
* randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp.
|
||
|
* 357-389.
|
||
|
* b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
|
||
|
* G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
|
||
|
* Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
|
||
|
/// Now unused but left in place in 1.4.x for ABI compatibility.
|
||
|
double lower_bound;
|
||
|
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// The constant value to be used in get_sumpart().
|
||
|
double log_constant;
|
||
|
double wqf_product_factor;
|
||
|
|
||
|
DLHWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
DLHWeight() {
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WQF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(TOTAL_LENGTH);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
DLHWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the PL2 weighting scheme.
|
||
|
*
|
||
|
* PL2 is a representative scheme of the Divergence from Randomness Framework
|
||
|
* by Gianni Amati.
|
||
|
*
|
||
|
* This weighting scheme is useful for tasks that require early precision.
|
||
|
*
|
||
|
* It uses the Poisson approximation of the Binomial Probabilistic distribution
|
||
|
* (P) along with Stirling's approximation for the factorial value, the Laplace
|
||
|
* method to find the aftereffect of sampling (L) and the second wdf
|
||
|
* normalization proposed by Amati to normalize the wdf in the document to the
|
||
|
* length of the document (H2).
|
||
|
*
|
||
|
* For more information about the DFR Framework and the PL2 scheme, please
|
||
|
* refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic models
|
||
|
* of information retrieval based on measuring the divergence from randomness
|
||
|
* ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp. 357-389.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/** The factor to multiply weights by.
|
||
|
*
|
||
|
* The misleading name is due to this having been used to store a lower
|
||
|
* bound in 1.4.0. We no longer need to store that, and so this member
|
||
|
* has been repurposed in 1.4.1 and later (but the name left the same to
|
||
|
* ensure ABI compatibility with 1.4.0).
|
||
|
*/
|
||
|
double lower_bound;
|
||
|
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// Constants for a given term in a given query.
|
||
|
double P1, P2;
|
||
|
|
||
|
/// Set by init() to (param_c * get_average_length())
|
||
|
double cl;
|
||
|
|
||
|
PL2Weight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a PL2Weight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. The
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis titled
|
||
|
* Probabilistic Models for Information Retrieval based on
|
||
|
* Divergence from Randomness.
|
||
|
*/
|
||
|
explicit PL2Weight(double c);
|
||
|
|
||
|
PL2Weight() : param_c(1.0) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
PL2Weight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/// Xapian::Weight subclass implementing the PL2+ probabilistic formula.
|
||
|
class XAPIAN_VISIBILITY_DEFAULT PL2PlusWeight : public Weight {
|
||
|
/// The factor to multiply weights by.
|
||
|
double factor;
|
||
|
|
||
|
/// The wdf normalization parameter in the formula.
|
||
|
double param_c;
|
||
|
|
||
|
/// Additional parameter delta in the PL2+ weighting formula.
|
||
|
double param_delta;
|
||
|
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// Constants for a given term in a given query.
|
||
|
double P1, P2;
|
||
|
|
||
|
/// Set by init() to (param_c * get_average_length())
|
||
|
double cl;
|
||
|
|
||
|
/// Set by init() to get_collection_freq()) / get_collection_size()
|
||
|
double mean;
|
||
|
|
||
|
/// Weight contribution of delta term in the PL2+ function
|
||
|
double dw;
|
||
|
|
||
|
PL2PlusWeight * clone() const;
|
||
|
|
||
|
void init(double factor_);
|
||
|
|
||
|
public:
|
||
|
/** Construct a PL2PlusWeight.
|
||
|
*
|
||
|
* @param c A strictly positive parameter controlling the extent
|
||
|
* of the normalization of the wdf to the document length. The
|
||
|
* default value of 1 is suitable for longer queries but it may
|
||
|
* need to be changed for shorter queries. For more information,
|
||
|
* please refer to Gianni Amati's PHD thesis titled
|
||
|
* Probabilistic Models for Information Retrieval based on
|
||
|
* Divergence from Randomness.
|
||
|
*
|
||
|
* @param delta A parameter for pseudo tf value to control the scale
|
||
|
* of the tf lower bound. Delta(δ) should be a positive
|
||
|
* real number. It can be tuned for example from 0.1 to 1.5
|
||
|
* in increments of 0.1 or so. Experiments have shown that
|
||
|
* PL2+ works effectively across collections with a fixed δ = 0.8
|
||
|
* (default 0.8)
|
||
|
*/
|
||
|
PL2PlusWeight(double c, double delta);
|
||
|
|
||
|
PL2PlusWeight()
|
||
|
: param_c(1.0), param_delta(0.8) {
|
||
|
need_stat(AVERAGE_LENGTH);
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(COLLECTION_SIZE);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(WQF);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
PL2PlusWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** This class implements the DPH weighting scheme.
|
||
|
*
|
||
|
* DPH is a representative scheme of the Divergence from Randomness Framework
|
||
|
* by Gianni Amati.
|
||
|
*
|
||
|
* This is a parameter free weighting scheme and it should be used with query
|
||
|
* expansion to obtain better results. It uses the HyperGeometric Probabilistic
|
||
|
* model and Popper's normalization to calculate the risk gain.
|
||
|
*
|
||
|
* For more information about the DFR Framework and the DPH scheme, please
|
||
|
* refer to :
|
||
|
* a.) Gianni Amati and Cornelis Joost Van Rijsbergen
|
||
|
* Probabilistic models of information retrieval based on measuring the
|
||
|
* divergence from randomness ACM Transactions on Information Systems (TOIS) 20,
|
||
|
* (4), 2002, pp. 357-389.
|
||
|
* b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
|
||
|
* G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
|
||
|
* Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
|
||
|
/// The upper bound on the weight.
|
||
|
double upper_bound;
|
||
|
|
||
|
/// Now unused but left in place in 1.4.x for ABI compatibility.
|
||
|
double lower_bound;
|
||
|
|
||
|
/// The constant value used in get_sumpart() .
|
||
|
double log_constant;
|
||
|
double wqf_product_factor;
|
||
|
|
||
|
DPHWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a DPHWeight. */
|
||
|
DPHWeight() {
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WQF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(TOTAL_LENGTH);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
DPHWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterms) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
|
||
|
/** Xapian::Weight subclass implementing the Language Model formula.
|
||
|
*
|
||
|
* This class implements the "Language Model" Weighting scheme, as
|
||
|
* described by the early papers on LM by Bruce Croft.
|
||
|
*
|
||
|
* LM works by comparing the query to a Language Model of the document.
|
||
|
* The language model itself is parameter-free, though LMWeight takes
|
||
|
* parameters which specify the smoothing used.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {
|
||
|
/** The type of smoothing to use. */
|
||
|
type_smoothing select_smoothing;
|
||
|
|
||
|
// Parameters for handling negative value of log, and for smoothing.
|
||
|
double param_log, param_smoothing1, param_smoothing2;
|
||
|
|
||
|
/** The factor to multiply weights by.
|
||
|
*
|
||
|
* The misleading name is due to this having been used to store some
|
||
|
* other value in 1.4.0. However, that value only takes one
|
||
|
* multiplication and one division to calculate, so for 1.4.x we can just
|
||
|
* recalculate it each time we need it, and so this member has been
|
||
|
* repurposed in 1.4.1 and later (but the name left the same to ensure ABI
|
||
|
* compatibility with 1.4.0).
|
||
|
*/
|
||
|
double weight_collection;
|
||
|
|
||
|
LMWeight * clone() const;
|
||
|
|
||
|
void init(double factor);
|
||
|
|
||
|
public:
|
||
|
/** Construct a LMWeight.
|
||
|
*
|
||
|
* @param param_log_ A non-negative parameter controlling how much
|
||
|
* to clamp negative values returned by the log.
|
||
|
* The log is calculated by multiplying the
|
||
|
* actual weight by param_log. If param_log is
|
||
|
* 0.0, then the document length upper bound will
|
||
|
* be used (default: document length upper bound)
|
||
|
*
|
||
|
* @param select_smoothing_ A parameter of type enum
|
||
|
* type_smoothing. This parameter
|
||
|
* controls which smoothing type to use.
|
||
|
* (default: TWO_STAGE_SMOOTHING)
|
||
|
*
|
||
|
* @param param_smoothing1_ A non-negative parameter for smoothing
|
||
|
* whose meaning depends on
|
||
|
* select_smoothing_. In
|
||
|
* JELINEK_MERCER_SMOOTHING, it plays the
|
||
|
* role of estimation and in
|
||
|
* DIRICHLET_SMOOTHING the role of query
|
||
|
* modelling. (default JELINEK_MERCER,
|
||
|
* ABSOLUTE, TWOSTAGE(0.7),
|
||
|
* DIRCHLET(2000))
|
||
|
*
|
||
|
* @param param_smoothing2_ A non-negative parameter which is used
|
||
|
* with TWO_STAGE_SMOOTHING as parameter for Dirichlet's
|
||
|
* smoothing (default: 2000) and as parameter delta to
|
||
|
* control the scale of the tf lower bound in the
|
||
|
* DIRICHLET_PLUS_SMOOTHING (default 0.05).
|
||
|
*
|
||
|
*/
|
||
|
// Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
|
||
|
explicit LMWeight(double param_log_ = 0.0,
|
||
|
type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
|
||
|
double param_smoothing1_ = -1.0,
|
||
|
double param_smoothing2_ = -1.0)
|
||
|
: select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
|
||
|
param_smoothing2(param_smoothing2_)
|
||
|
{
|
||
|
if (param_smoothing1 < 0) param_smoothing1 = 0.7;
|
||
|
if (param_smoothing2 < 0) {
|
||
|
if (select_smoothing == TWO_STAGE_SMOOTHING)
|
||
|
param_smoothing2 = 2000.0;
|
||
|
else
|
||
|
param_smoothing2 = 0.05;
|
||
|
}
|
||
|
need_stat(DOC_LENGTH);
|
||
|
need_stat(RSET_SIZE);
|
||
|
need_stat(TERMFREQ);
|
||
|
need_stat(RELTERMFREQ);
|
||
|
need_stat(DOC_LENGTH_MAX);
|
||
|
need_stat(WDF);
|
||
|
need_stat(WDF_MAX);
|
||
|
need_stat(COLLECTION_FREQ);
|
||
|
need_stat(TOTAL_LENGTH);
|
||
|
if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
|
||
|
need_stat(UNIQUE_TERMS);
|
||
|
if (select_smoothing == DIRICHLET_PLUS_SMOOTHING)
|
||
|
need_stat(DOC_LENGTH_MIN);
|
||
|
}
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
LMWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
/** Xapian::Weight subclass implementing Coordinate Matching.
|
||
|
*
|
||
|
* Each matching term score one point. See Managing Gigabytes, Second Edition
|
||
|
* p181.
|
||
|
*/
|
||
|
class XAPIAN_VISIBILITY_DEFAULT CoordWeight : public Weight {
|
||
|
/// The factor to multiply weights by.
|
||
|
double factor;
|
||
|
|
||
|
public:
|
||
|
CoordWeight * clone() const;
|
||
|
|
||
|
void init(double factor_);
|
||
|
|
||
|
/** Construct a CoordWeight. */
|
||
|
CoordWeight() { }
|
||
|
|
||
|
std::string name() const;
|
||
|
|
||
|
std::string serialise() const;
|
||
|
CoordWeight * unserialise(const std::string & serialised) const;
|
||
|
|
||
|
double get_sumpart(Xapian::termcount wdf,
|
||
|
Xapian::termcount doclen,
|
||
|
Xapian::termcount uniqterm) const;
|
||
|
double get_maxpart() const;
|
||
|
|
||
|
double get_sumextra(Xapian::termcount, Xapian::termcount) const;
|
||
|
double get_maxextra() const;
|
||
|
};
|
||
|
|
||
|
}
|
||
|
|
||
|
#endif // XAPIAN_INCLUDED_WEIGHT_H
|