/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Lucene.Net.Documents;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;
using IDFExplanation = Lucene.Net.Search.Explanation.IDFExplanation;

namespace Lucene.Net.Search
{
    /// <summary>Expert: Scoring API.
    ///
    /// Subclasses implement search scoring.
    ///
    /// The score of query q for document d correlates to the
    /// cosine-distance or dot-product between document and query vectors in a
    /// Vector Space Model (VSM) of Information Retrieval.
    /// A document whose vector is closer to the query vector in that model is scored higher.
    ///
    /// The score is computed as follows:
    ///
    ///     score(q,d) = coord(q,d) * queryNorm(q)
    ///                  * SUM over all terms t in q of:
    ///                        tf(t in d) * idf(t)^2 * t.Boost * norm(t,d)
    ///
    /// where
    ///
    /// tf(t in d) correlates to the term's frequency,
    /// defined as the number of times term t appears in the currently scored document d.
    /// Documents that have more occurrences of a given term receive a higher score.
    /// The default computation for tf(t in d) in DefaultSimilarity is:
    ///
    ///     tf(t in d) = frequency^(1/2)
    ///
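    /// In code form, the default computation is a one-liner; the square root damps
    /// the contribution of repeated occurrences (a transcription of the formula above):
    /// <code>
    /// public override float Tf(float freq)
    /// {
    ///     return (float) System.Math.Sqrt(freq);
    /// }
    /// </code>
    ///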
    /// idf(t) stands for Inverse Document Frequency. This value
    /// correlates to the inverse of docFreq
    /// (the number of documents in which the term t appears).
    /// This means rarer terms give a higher contribution to the total score.
    /// The default computation for idf(t) in DefaultSimilarity is:
    ///
    ///     idf(t) = 1 + log( numDocs / (docFreq + 1) )
    ///
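    /// A sketch matching the formula above (the integer operands are widened to
    /// double before dividing, so the quotient is not truncated):
    /// <code>
    /// public override float Idf(int docFreq, int numDocs)
    /// {
    ///     return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
    /// }
    /// </code>
    ///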
    /// coord(q,d) is a score factor based on how many of the query terms are found in the specified document.
    /// Typically, a document that contains more of the query's terms will receive a higher score
    /// than another document with fewer query terms.
    /// This is a search time factor computed in Coord(int, int) by the Similarity in effect at search time.
    ///
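    /// For illustration, the default behavior amounts to scoring the fraction of
    /// query terms that matched (a sketch of DefaultSimilarity's computation):
    /// <code>
    /// public override float Coord(int overlap, int maxOverlap)
    /// {
    ///     // e.g. 3 of 4 query terms present in the document => 0.75
    ///     return overlap / (float) maxOverlap;
    /// }
    /// </code>
    ///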
    /// queryNorm(q) is a normalizing factor used to make scores between queries comparable.
    /// This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
    /// but rather just attempts to make scores from different queries (or even different indexes) comparable.
    /// This is a search time factor computed by the Similarity in effect at search time.
    /// The default computation in DefaultSimilarity is:
    ///
    ///     queryNorm(q) = queryNorm(sumOfSquaredWeights) = 1 / sumOfSquaredWeights^(1/2)
    ///
    /// The sum of squared weights (of the query terms) is
    /// computed by the query object.
    /// For example, a boolean query computes this value as:
    ///
    ///     GetSumOfSquaredWeights = q.Boost^2 * SUM over all terms t in q of: ( idf(t) * t.Boost )^2
    ///
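    /// In code form, the default query normalization is again a direct transcription
    /// of the formula above:
    /// <code>
    /// public override float QueryNorm(float sumOfSquaredWeights)
    /// {
    ///     return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
    /// }
    /// </code>
    ///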
    /// t.Boost is a search time boost of term t in the query q as
    /// specified in the query text (see query syntax),
    /// or as set by application calls to Query.Boost.
    /// Notice that there is really no direct API for accessing the boost of one term in a multi-term query,
    /// but rather multi-term queries are represented in a query as multiple TermQuery objects,
    /// and so the boost of a term in the query is accessible by reading the sub-query's Boost property.
    ///
    /// norm(t,d) encapsulates a few (indexing time) boost and length factors:
    ///
    ///   - Document boost: set by calling doc.Boost before adding the document to the index.
    ///   - Field boost: set by calling field.Boost before adding the field to a document.
    ///   - LengthNorm(field): computed when the document is added to the index in accordance
    ///     with the number of tokens of this field in the document, so that shorter fields
    ///     contribute more to the score. LengthNorm is computed by the Similarity class in
    ///     effect at indexing.
    ///
    /// When a document is added to the index, all the above factors are multiplied.
    /// If the document has multiple fields with the same name, all their boosts are multiplied together:
    ///
    ///     norm(t,d) = doc.Boost * LengthNorm(field)
    ///                 * PRODUCT over all fields f in d named as t of: f.Boost
    ///
    /// However, the resulting norm value is encoded as a single byte before being stored.
    /// At search time, the norm byte value is read from the index directory and
    /// decoded back to a float norm value.
    /// This encoding/decoding, while reducing index size, comes with the price of
    /// precision loss - it is not guaranteed that decode(encode(x)) = x.
    /// For instance, decode(encode(0.89)) = 0.75.
    /// Also notice that search time is too late to modify this norm part of scoring, e.g. by
    /// using a different Similarity for search.
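    ///
    /// The round-trip loss can be observed directly with the encode/decode helpers
    /// declared on this class:
    /// <code>
    /// byte b = Similarity.EncodeNorm(0.89f);  // stored as a 3-bit mantissa, 5-bit exponent byte
    /// float norm = Similarity.DecodeNorm(b);  // yields 0.75f, not 0.89f
    /// </code>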
    /// </summary>
    [Serializable]
    public abstract class Similarity
    {
        protected Similarity()
        {
            InitBlock();
        }

        [Serializable]
        private class AnonymousClassIDFExplanation1 : IDFExplanation
        {
            public AnonymousClassIDFExplanation1(int df, int max, float idf, Similarity enclosingInstance)
            {
                InitBlock(df, max, idf, enclosingInstance);
            }

            private void InitBlock(int df, int max, float idf, Similarity enclosingInstance)
            {
                this.df = df;
                this.max = max;
                this.idf = idf;
                this.enclosingInstance = enclosingInstance;
            }

            private int df;
            private int max;
            private float idf;
            private Similarity enclosingInstance;

            public Similarity Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            public override System.String Explain()
            {
                return "idf(docFreq=" + df + ", maxDocs=" + max + ")";
            }

            public override float Idf
            {
                get { return idf; }
            }
        }

        [Serializable]
        private class AnonymousClassIDFExplanation3 : IDFExplanation
        {
            public AnonymousClassIDFExplanation3(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
            {
                InitBlock(fIdf, exp, enclosingInstance);
            }

            private void InitBlock(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
            {
                this.fIdf = fIdf;
                this.exp = exp;
                this.enclosingInstance = enclosingInstance;
            }

            private float fIdf;
            private System.Text.StringBuilder exp;
            private Similarity enclosingInstance;

            public Similarity Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            public override float Idf
            {
                get { return fIdf; }
            }

            public override System.String Explain()
            {
                return exp.ToString();
            }
        }

        private void InitBlock()
        {
        }

        /// <summary>The Similarity implementation used by default.</summary>
        private static Similarity defaultImpl = new DefaultSimilarity();

        public const int NO_DOC_ID_PROVIDED = -1;

        /// <summary>Gets or sets the default Similarity implementation
        /// used by indexing and search code.
        ///
        /// This is initially an instance of <see cref="DefaultSimilarity"/>.
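        ///
        /// For example, an application can install a custom implementation at startup;
        /// MySimilarity here is a hypothetical subclass, shown only for illustration:
        /// <code>
        /// Similarity.Default = new MySimilarity();
        /// </code>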
        /// </summary>
        public static Similarity Default
        {
            get { return defaultImpl; }
            set { defaultImpl = value; }
        }

        /// <summary>Cache of decoded bytes.</summary>
        private static readonly float[] NORM_TABLE = new float[256];

        /// <summary>Decodes a normalization factor stored in an index.</summary>
        /// <seealso cref="EncodeNorm(float)"/>
        public static float DecodeNorm(byte b)
        {
            return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
        }

        /// <summary>Returns a table for decoding normalization bytes.</summary>
        /// <seealso cref="EncodeNorm(float)"/>
        public static float[] GetNormDecoder()
        {
            return NORM_TABLE;
        }

        /// <summary>Compute the normalization value for a field, given the accumulated
        /// state of term processing for this field (see <see cref="FieldInvertState"/>).
        ///
        /// Implementations should calculate a float value based on the field
        /// state and then return that value.
        ///
        /// For backward compatibility this method by default calls
        /// <see cref="LengthNorm(String, int)"/>, passing
        /// <see cref="FieldInvertState.Length"/> as the second argument, and
        /// then multiplies this value by <see cref="FieldInvertState.Boost"/>.
        ///
        /// WARNING: This API is new and experimental and may suddenly change.
        /// </summary>
        /// <param name="field">field name</param>
        /// <param name="state">current processing state for this field</param>
        /// <returns>the calculated float norm</returns>
        public virtual float ComputeNorm(System.String field, FieldInvertState state)
        {
            return (float) (state.Boost * LengthNorm(field, state.Length));
        }

        /// <summary>Computes the normalization value for a field given the total number of
        /// terms contained in a field. These values, together with field boosts, are
        /// stored in an index and multiplied into scores for hits on each field by the
        /// search code.
        ///
        /// Matches in longer fields are less precise, so implementations of this
        /// method usually return smaller values when numTokens is large,
        /// and larger values when numTokens is small.
        ///
        /// Note that the return values are computed under IndexWriter.AddDocument
        /// and then stored using <see cref="EncodeNorm(float)"/>.
        /// Thus they have limited precision, and documents
        /// must be re-indexed if this method is altered.
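        ///
        /// For illustration, a sketch matching the default length normalization,
        /// which favors shorter fields (this mirrors DefaultSimilarity):
        /// <code>
        /// public override float LengthNorm(System.String fieldName, int numTokens)
        /// {
        ///     return (float) (1.0 / System.Math.Sqrt(numTokens));
        /// }
        /// </code>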
        /// </summary>
        /// <param name="fieldName">the name of the field</param>
        /// <param name="numTokens">the total number of tokens contained in fields named
        /// fieldName of doc</param>
        /// <returns>a normalization factor for hits on this field of this document</returns>
        public abstract float LengthNorm(System.String fieldName, int numTokens);

        /// <summary>Computes the normalization value for a query given the sum of the squared
        /// weights of each of the query terms. This value is then multiplied into the
        /// weight of each query term.
        ///
        /// This does not affect ranking, but rather just attempts to make scores
        /// from different queries comparable.
        /// </summary>
        /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
        /// <returns>a normalization factor for query weights</returns>
        public abstract float QueryNorm(float sumOfSquaredWeights);

        /// <summary>Encodes a normalization factor for storage in an index.
        ///
        /// The encoding uses a three-bit mantissa, a five-bit exponent, and
        /// the zero-exponent point at 15, thus
        /// representing values from around 7x10^9 to 2x10^-9 with about one
        /// significant decimal digit of accuracy. Zero is also represented.
        /// Negative numbers are rounded up to zero. Values too large to represent
        /// are rounded down to the largest representable value. Positive values too
        /// small to represent are rounded up to the smallest positive representable
        /// value.
        /// </summary>
        /// <seealso cref="Lucene.Net.Util.SmallFloat"/>
        public static byte EncodeNorm(float f)
        {
            return (byte) SmallFloat.FloatToByte315(f);
        }

        /// <summary>Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the <see cref="Idf(int, int)"/>
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///
        /// Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when freq is large, and smaller values when freq is small.
        ///
        /// The default implementation calls <see cref="Tf(float)"/>.
        /// </summary>
        /// <param name="freq">the frequency of a term within a document</param>
        /// <returns>a score factor based on a term's within-document frequency</returns>
        public virtual float Tf(int freq)
        {
            return Tf((float) freq);
        }

        /// <summary>Computes the amount of a sloppy phrase match, based on an edit distance.
        /// This value is summed for each sloppy phrase match in a document to form
        /// the frequency that is passed to <see cref="Tf(float)"/>.
        ///
        /// A phrase match with a small edit distance to a document passage more
        /// closely matches the document, so implementations of this method usually
        /// return larger values when the edit distance is small and smaller values
        /// when it is large.
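        ///
        /// A sketch matching the default behavior, a reciprocal decay with distance
        /// (this mirrors DefaultSimilarity):
        /// <code>
        /// public override float SloppyFreq(int distance)
        /// {
        ///     // distance 0 (exact phrase) => 1.0, distance 1 => 0.5, ...
        ///     return 1.0f / (distance + 1);
        /// }
        /// </code>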
        /// </summary>
        /// <param name="distance">the edit distance of this sloppy phrase match</param>
        /// <returns>the frequency increment for this match</returns>
        public abstract float SloppyFreq(int distance);

        /// <summary>Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the <see cref="Idf(int, int)"/>
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///
        /// Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when freq is large, and smaller values when freq is small.
        /// </summary>
        /// <param name="freq">the frequency of a term within a document</param>
        /// <returns>a score factor based on a term's within-document frequency</returns>
        public abstract float Tf(float freq);

        /// <summary>Computes a score factor for a simple term and returns an explanation
        /// for that score factor.
        ///
        /// The default implementation uses:
        /// <code>
        /// Idf(searcher.DocFreq(term), searcher.MaxDoc);
        /// </code>
        /// Note that Searcher.MaxDoc is used instead of IndexReader.NumDocs()
        /// because it is proportional to Searcher.DocFreq(Term), i.e., when one is
        /// inaccurate, so is the other, and in the same direction.
        /// </summary>
        /// <param name="term">the term in question</param>
        /// <param name="searcher">the document collection being searched</param>
        /// <returns>an IDFExplanation object that includes both an idf score factor
        /// and an explanation for the term</returns>
        /// <throws>IOException</throws>
        public virtual IDFExplanation IdfExplain(Term term, Searcher searcher)
        {
            int df = searcher.DocFreq(term);
            int max = searcher.MaxDoc;
            float idf2 = Idf(df, max);
            return new AnonymousClassIDFExplanation1(df, max, idf2, this);
        }

        /// <summary>Computes a score factor for a phrase.
        ///
        /// The default implementation sums the idf factor for
        /// each term in the phrase.
        /// </summary>
        /// <param name="terms">the terms in the phrase</param>
        /// <param name="searcher">the document collection being searched</param>
        /// <returns>an IDFExplanation object that includes both an idf
        /// score factor for the phrase and an explanation
        /// for each term</returns>
        /// <throws>IOException</throws>
        public virtual IDFExplanation IdfExplain(ICollection<Term> terms, Searcher searcher)
        {
            int max = searcher.MaxDoc;
            float idf2 = 0.0f;
            System.Text.StringBuilder exp = new System.Text.StringBuilder();
            foreach (Term term in terms)
            {
                int df = searcher.DocFreq(term);
                idf2 += Idf(df, max);
                exp.Append(" ");
                exp.Append(term.Text);
                exp.Append("=");
                exp.Append(df);
            }
            float fIdf = idf2;
            return new AnonymousClassIDFExplanation3(fIdf, exp, this);
        }

        /// <summary>Computes a score factor based on a term's document frequency (the number
        /// of documents which contain the term). This value is multiplied by the
        /// <see cref="Tf(int)"/> factor for each term in the query and these products are
        /// then summed to form the initial score for a document.
        ///
        /// Terms that occur in fewer documents are better indicators of topic, so
        /// implementations of this method usually return larger values for rare terms,
        /// and smaller values for common terms.
        /// </summary>
        /// <param name="docFreq">the number of documents which contain the term</param>
        /// <param name="numDocs">the total number of documents in the collection</param>
        /// <returns>a score factor based on the term's document frequency</returns>
        public abstract float Idf(int docFreq, int numDocs);

        /// <summary>Computes a score factor based on the fraction of all query terms that a
        /// document contains. This value is multiplied into scores.
        ///
        /// The presence of a large portion of the query terms indicates a better
        /// match with the query, so implementations of this method usually return
        /// larger values when the ratio between these parameters is large and smaller
        /// values when the ratio between them is small.
        /// </summary>
        /// <param name="overlap">the number of query terms matched in the document</param>
        /// <param name="maxOverlap">the total number of terms in the query</param>
        /// <returns>a score factor based on term overlap with the query</returns>
        public abstract float Coord(int overlap, int maxOverlap);

        /// <summary>Calculate a scoring factor based on the data in the payload. Overriding implementations
        /// are responsible for interpreting what is in the payload. Lucene makes no assumptions about
        /// what is in the byte array.
        ///
        /// The default implementation returns 1.
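        ///
        /// For example, a hypothetical override (illustrative only, not part of the API)
        /// could boost terms whose payload carries a non-zero first byte:
        /// <code>
        /// public override float ScorePayload(int docId, System.String fieldName,
        ///                                    int start, int end, byte[] payload,
        ///                                    int offset, int length)
        /// {
        ///     if (payload != null && length > 0 && payload[offset] != 0)
        ///         return 2.0f; // double the weight of flagged occurrences
        ///     return 1.0f;     // otherwise keep the default neutral factor
        /// }
        /// </code>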
        /// </summary>
        /// <param name="docId">The docId currently being scored. If this value is
        /// <see cref="NO_DOC_ID_PROVIDED"/>, then it should be assumed that the PayloadQuery
        /// implementation does not provide document information</param>
        /// <param name="fieldName">The fieldName of the term this payload belongs to</param>
        /// <param name="start">The start position of the payload</param>
        /// <param name="end">The end position of the payload</param>
        /// <param name="payload">The payload byte array to be scored</param>
        /// <param name="offset">The offset into the payload array</param>
        /// <param name="length">The length in the array</param>
        /// <returns>An implementation dependent float to be used as a scoring factor</returns>
        public virtual float ScorePayload(int docId, System.String fieldName, int start, int end, byte[] payload, int offset, int length)
        {
            return 1;
        }

        static Similarity()
        {
            for (int i = 0; i < 256; i++)
                NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
        }
    }
}