/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using FieldInvertState = Lucene.Net.Index.FieldInvertState;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;
using IDFExplanation = Lucene.Net.Search.Explanation.IDFExplanation;

namespace Lucene.Net.Search
{
	
	/// Expert: Scoring API.

Subclasses implement search scoring. /// ///

The score of query q for document d correlates to the /// cosine-distance or dot-product between document and query vectors in a /// /// Vector Space Model (VSM) of Information Retrieval. /// A document whose vector is closer to the query vector in that model is scored higher. /// /// The score is computed as follows: /// ///

/// /// ///
/// /// /// /// /// /// /// /// /// /// /// ///
/// score(q,d)   =   /// coord(q,d)  ·  /// queryNorm(q)  ·  /// /// /// /// ( /// tf(t in d)  ·  /// idf(t)2  ·  /// t.getBoost() ·  /// norm(t,d) /// ) ///
t in q
///
/// ///

where ///

    ///
	/// 1. tf(t in d) correlates to the term's frequency,
	///    defined as the number of times term t appears in the currently scored document d.
	///    Documents that have more occurrences of a given term receive a higher score.
	///    The default computation for tf(t in d) in
	///    {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) DefaultSimilarity} is:
	///
	/// 	tf(t in d)  =  frequency^(1/2)
	///
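	///    For example, a custom Similarity might damp term frequency more aggressively
	///    with a logarithm instead of the square root (an illustrative sketch; the
	///    LogTfSimilarity name is hypothetical, not part of this library):
	///
	/// 	public class LogTfSimilarity : DefaultSimilarity
	/// 	{
	/// 		public override float Tf(float freq)
	/// 		{
	/// 			// grow sub-linearly: repeated terms help, but with diminishing returns
	/// 			return freq > 0 ? (float) (1.0 + System.Math.Log(freq)) : 0.0f;
	/// 		}
	/// 	}
	///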
	/// 2. idf(t) stands for Inverse Document Frequency.  This value
	///    correlates to the inverse of docFreq
	///    (the number of documents in which the term t appears).
	///    This means rarer terms give higher contribution to the total score.
	///    The default computation for idf(t) in
	///    {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) DefaultSimilarity} is:
	///
	/// 	idf(t)  =  1 + log( numDocs / (docFreq + 1) )
	///
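	///    As a worked example (the default uses the natural logarithm): with
	///    numDocs = 1000, a term appearing in 9 documents scores
	///    idf = 1 + ln(1000/10) ≈ 5.6, while a term appearing in 499 documents
	///    scores only idf = 1 + ln(2) ≈ 1.7.
	///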
	/// 3. coord(q,d) is a score factor based on how many of the query terms are found
	///    in the specified document.  Typically, a document that contains more of the
	///    query's terms will receive a higher score than another document with fewer
	///    query terms.  This is a search time factor computed in
	///    {@link #Coord(int, int) coord(q,d)} by the Similarity in effect at search time.
	///
	/// 4. queryNorm(q) is a normalizing factor used to make scores between queries comparable.
	///    This factor does not affect document ranking (since all ranked documents are
	///    multiplied by the same factor), but rather just attempts to make scores from
	///    different queries (or even different indexes) comparable.
	///    This is a search time factor computed by the Similarity in effect at search time.
	///    The default computation in
	///    {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) DefaultSimilarity} is:
	///
	/// 	queryNorm(q)  =  queryNorm(sumOfSquaredWeights)  =  1 / sumOfSquaredWeights^(1/2)
	///
	///    The sum of squared weights (of the query terms) is
	///    computed by the query {@link Lucene.Net.Search.Weight} object.
	///    For example, a {@link Lucene.Net.Search.BooleanQuery boolean query}
	///    computes this value as:
	///
	/// 	sumOfSquaredWeights  =  q.getBoost()² · Σ over t in q of ( idf(t) · t.getBoost() )²
	///
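	///    For instance, if sumOfSquaredWeights evaluates to 16.0 for a query, the
	///    default queryNorm is 1/√16 = 0.25, and every term weight in that query
	///    is scaled by 0.25.
	///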
	/// 5. t.getBoost() is a search time boost of term t in the query q as
	///    specified in the query text (see query syntax),
	///    or as set by application calls to
	///    {@link Lucene.Net.Search.Query#SetBoost(float) setBoost()}.
	///    Notice that there is really no direct API for accessing a boost of one term
	///    in a multi term query, but rather multi terms are represented in a query as
	///    multi {@link Lucene.Net.Search.TermQuery TermQuery} objects, and so the boost
	///    of a term in the query is accessible by calling the sub-query
	///    {@link Lucene.Net.Search.Query#GetBoost() getBoost()}.
	///
	/// 6. norm(t,d) encapsulates a few (indexing time) boost and length factors:
	///
	///    - Document boost - set by calling
	///      {@link Lucene.Net.Documents.Document#SetBoost(float) doc.setBoost()}
	///      before adding the document to the index.
	///    - Field boost - set by calling
	///      {@link Lucene.Net.Documents.Fieldable#SetBoost(float) field.setBoost()}
	///      before adding the field to a document.
	///    - {@link #LengthNorm(String, int) lengthNorm(field)} - computed
	///      when the document is added to the index in accordance with the number of
	///      tokens of this field in the document, so that shorter fields contribute
	///      more to the score.  LengthNorm is computed by the Similarity class in
	///      effect at indexing.
	///

    /// When a document is added to the index, all the above factors are multiplied. /// If the document has multiple fields with the same name, all their boosts are multiplied together: /// ///
     
    /// /// /// /// /// /// /// /// /// /// /// ///
    /// norm(t,d)   =   /// {@link Lucene.Net.Documents.Document#GetBoost() doc.getBoost()} ///  ·  /// {@link #LengthNorm(String, int) lengthNorm(field)} ///  ·  /// /// /// /// {@link Lucene.Net.Documents.Fieldable#GetBoost() f.getBoost}() ///
    field f in d named as t
    ///
     
    /// However the resulted norm value is {@link #EncodeNorm(float) encoded} as a single byte /// before being stored. /// At search time, the norm byte value is read from the index /// {@link Lucene.Net.Store.Directory directory} and /// {@link #DecodeNorm(byte) decoded} back to a float norm value. /// This encoding/decoding, while reducing index size, comes with the price of /// precision loss - it is not guaranteed that decode(encode(x)) = x. /// For instance, decode(encode(0.89)) = 0.75. /// Also notice that search time is too late to modify this norm part of scoring, e.g. by /// using a different {@link Similarity} for search. ///
     
    ///

	[Serializable]
	public abstract class Similarity
	{
		public Similarity()
		{
			InitBlock();
		}
		
		[Serializable]
		private class AnonymousClassIDFExplanation : IDFExplanation
		{
			public AnonymousClassIDFExplanation(float idf, Similarity enclosingInstance)
			{
				InitBlock(idf, enclosingInstance);
			}
			private void InitBlock(float idf, Similarity enclosingInstance)
			{
				this.idf = idf;
				this.enclosingInstance = enclosingInstance;
			}
			private float idf;
			private Similarity enclosingInstance;
			public Similarity Enclosing_Instance
			{
				get { return enclosingInstance; }
			}
			//@Override
			public override float GetIdf()
			{
				return idf;
			}
			//@Override
			public override System.String Explain()
			{
				return "Inexplicable";
			}
		}
		
		[Serializable]
		private class AnonymousClassIDFExplanation1 : IDFExplanation
		{
			public AnonymousClassIDFExplanation1(int df, int max, float idf, Similarity enclosingInstance)
			{
				InitBlock(df, max, idf, enclosingInstance);
			}
			private void InitBlock(int df, int max, float idf, Similarity enclosingInstance)
			{
				this.df = df;
				this.max = max;
				this.idf = idf;
				this.enclosingInstance = enclosingInstance;
			}
			private int df;
			private int max;
			private float idf;
			private Similarity enclosingInstance;
			public Similarity Enclosing_Instance
			{
				get { return enclosingInstance; }
			}
			//@Override
			public override System.String Explain()
			{
				return "idf(docFreq=" + df + ", maxDocs=" + max + ")";
			}
			//@Override
			public override float GetIdf()
			{
				return idf;
			}
		}
		
		[Serializable]
		private class AnonymousClassIDFExplanation2 : IDFExplanation
		{
			public AnonymousClassIDFExplanation2(float idf, Similarity enclosingInstance)
			{
				InitBlock(idf, enclosingInstance);
			}
			private void InitBlock(float idf, Similarity enclosingInstance)
			{
				this.idf = idf;
				this.enclosingInstance = enclosingInstance;
			}
			private float idf;
			private Similarity enclosingInstance;
			public Similarity Enclosing_Instance
			{
				get { return enclosingInstance; }
			}
			//@Override
			public override float GetIdf()
			{
				return idf;
			}
			//@Override
			public override System.String Explain()
			{
				return "Inexplicable";
			}
		}
		
		[Serializable]
		private class AnonymousClassIDFExplanation3 : IDFExplanation
		{
			public AnonymousClassIDFExplanation3(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
			{
				InitBlock(fIdf, exp, enclosingInstance);
			}
			private void InitBlock(float fIdf, System.Text.StringBuilder exp, Similarity enclosingInstance)
			{
				this.fIdf = fIdf;
				this.exp = exp;
				this.enclosingInstance = enclosingInstance;
			}
			private float fIdf;
			private System.Text.StringBuilder exp;
			private Similarity enclosingInstance;
			public Similarity Enclosing_Instance
			{
				get { return enclosingInstance; }
			}
			//@Override
			public override float GetIdf()
			{
				return fIdf;
			}
			//@Override
			public override System.String Explain()
			{
				return exp.ToString();
			}
		}
		
		private void InitBlock()
		{
			SupportedMethods = GetSupportedMethods(this.GetType());
		}
		
		public const int NO_DOC_ID_PROVIDED = -1;
		
		/// Set the default Similarity implementation used by indexing and search code.
		public static void SetDefault(Similarity similarity)
		{
			Similarity.defaultImpl = similarity;
		}
		
		/// Return the default Similarity implementation used by indexing and search code.
		///

This is initially an instance of {@link DefaultSimilarity}. /// ///
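		///
		/// A typical way to install a custom implementation process-wide (the
		/// MySimilarity name below is hypothetical, shown only for illustration):
		///
		/// 	Similarity.SetDefault(new MySimilarity());
		/// 	Similarity current = Similarity.GetDefault();
		///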

/// /// /// /// public static Similarity GetDefault() { return Similarity.defaultImpl; } /// Cache of decoded bytes. private static readonly float[] NORM_TABLE = new float[256]; /// Decodes a normalization factor stored in an index. /// /// public static float DecodeNorm(byte b) { return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 } /// Returns a table for decoding normalization bytes. /// /// public static float[] GetNormDecoder() { return NORM_TABLE; } /// Compute the normalization value for a field, given the accumulated /// state of term processing for this field (see {@link FieldInvertState}). /// ///

Implementations should calculate a float value based on the field /// state and then return that value. /// ///

For backward compatibility this method by default calls /// {@link #LengthNorm(String, int)} passing /// {@link FieldInvertState#GetLength()} as the second argument, and /// then multiplies this value by {@link FieldInvertState#GetBoost()}.

/// ///

WARNING: This API is new and experimental and may /// suddenly change.

/// ///

		/// <param name="field">field name</param>
		/// <param name="state">current processing state for this field</param>
		/// <returns>the calculated float norm</returns>
		public virtual float ComputeNorm(System.String field, FieldInvertState state)
		{
			return (float) (state.GetBoost() * LengthNorm(field, state.GetLength()));
		}
		
		/// Computes the normalization value for a field given the total number of
		/// terms contained in a field.  These values, together with field boosts, are
		/// stored in an index and multiplied into scores for hits on each field by the
		/// search code.
		///

Matches in longer fields are less precise, so implementations of this /// method usually return smaller values when numTokens is large, /// and larger values when numTokens is small. /// ///

Note that the return values are computed under /// {@link Lucene.Net.Index.IndexWriter#AddDocument(Lucene.Net.Documents.Document)} /// and then stored using /// {@link #EncodeNorm(float)}. /// Thus they have limited precision, and documents /// must be re-indexed if this method is altered. /// ///
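		///
		/// For reference, {@link Lucene.Net.Search.DefaultSimilarity} implements this
		/// method essentially as follows (sketch of the stock behavior):
		///
		/// 	public override float LengthNorm(System.String fieldName, int numTerms)
		/// 	{
		/// 		// shorter fields get a larger norm, hence a larger score contribution
		/// 		return (float) (1.0 / System.Math.Sqrt(numTerms));
		/// 	}
		///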

/// the name of the field /// /// the total number of tokens contained in fields named /// fieldName of doc. /// /// a normalization factor for hits on this field of this document /// /// /// /// public abstract float LengthNorm(System.String fieldName, int numTokens); /// Computes the normalization value for a query given the sum of the squared /// weights of each of the query terms. This value is then multipled into the /// weight of each query term. /// ///

This does not affect ranking, but rather just attempts to make scores /// from different queries comparable. /// ///
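		///
		/// {@link Lucene.Net.Search.DefaultSimilarity} implements this essentially as
		/// (sketch of the stock behavior):
		///
		/// 	public override float QueryNorm(float sumOfSquaredWeights)
		/// 	{
		/// 		return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
		/// 	}
		///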

/// the sum of the squares of query term weights /// /// a normalization factor for query weights /// public abstract float QueryNorm(float sumOfSquaredWeights); /// Encodes a normalization factor for storage in an index. /// ///

The encoding uses a three-bit mantissa, a five-bit exponent, and /// the zero-exponent point at 15, thus /// representing values from around 7x10^9 to 2x10^-9 with about one /// significant decimal digit of accuracy. Zero is also represented. /// Negative numbers are rounded up to zero. Values too large to represent /// are rounded down to the largest representable value. Positive values too /// small to represent are rounded up to the smallest positive representable /// value. /// ///

/// /// /// /// public static byte EncodeNorm(float f) { return (byte) SmallFloat.FloatToByte315(f); } /// Computes a score factor based on a term or phrase's frequency in a /// document. This value is multiplied by the {@link #Idf(Term, Searcher)} /// factor for each term in the query and these products are then summed to /// form the initial score for a document. /// ///

Terms and phrases repeated in a document indicate the topic of the /// document, so implementations of this method usually return larger values /// when freq is large, and smaller values when freq /// is small. /// ///

The default implementation calls {@link #Tf(float)}. /// ///

/// the frequency of a term within a document /// /// a score factor based on a term's within-document frequency /// public virtual float Tf(int freq) { return Tf((float) freq); } /// Computes the amount of a sloppy phrase match, based on an edit distance. /// This value is summed for each sloppy phrase match in a document to form /// the frequency that is passed to {@link #Tf(float)}. /// ///

A phrase match with a small edit distance to a document passage more /// closely matches the document, so implementations of this method usually /// return larger values when the edit distance is small and smaller values /// when it is large. /// ///
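		///
		/// {@link Lucene.Net.Search.DefaultSimilarity} implements this essentially as
		/// (sketch of the stock behavior):
		///
		/// 	public override float SloppyFreq(int distance)
		/// 	{
		/// 		// an exact phrase (distance 0) counts as a full occurrence
		/// 		return 1.0f / (distance + 1);
		/// 	}
		///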

/// /// /// the edit distance of this sloppy phrase match /// /// the frequency increment for this match /// public abstract float SloppyFreq(int distance); /// Computes a score factor based on a term or phrase's frequency in a /// document. This value is multiplied by the {@link #Idf(Term, Searcher)} /// factor for each term in the query and these products are then summed to /// form the initial score for a document. /// ///

Terms and phrases repeated in a document indicate the topic of the /// document, so implementations of this method usually return larger values /// when freq is large, and smaller values when freq /// is small. /// ///
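		///
		/// {@link Lucene.Net.Search.DefaultSimilarity} implements this essentially as
		/// (sketch of the stock behavior):
		///
		/// 	public override float Tf(float freq)
		/// 	{
		/// 		return (float) System.Math.Sqrt(freq);
		/// 	}
		///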

/// the frequency of a term within a document /// /// a score factor based on a term's within-document frequency /// public abstract float Tf(float freq); /// Computes a score factor for a simple term. /// ///

The default implementation is:

		/// return idf(searcher.docFreq(term), searcher.maxDoc());
		/// 
		/// Note that {@link Searcher#MaxDoc()} is used instead of
		/// {@link Lucene.Net.Index.IndexReader#NumDocs()} because it is proportional to
		/// {@link Searcher#DocFreq(Term)}, i.e., when one is inaccurate,
		/// so is the other, and in the same direction.
		///
		/// <param name="term">the term in question</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>a score factor for the term</returns>
		/// See {@link #IdfExplain(Term, Searcher)}.
		[Obsolete("see IdfExplain(Term, Searcher)")]
		public virtual float Idf(Term term, Searcher searcher)
		{
			return Idf(searcher.DocFreq(term), searcher.MaxDoc());
		}
		
		/// Computes a score factor for a simple term and returns an explanation
		/// for that score factor.
		///

/// The default implementation uses: /// ///

		/// idf(searcher.docFreq(term), searcher.maxDoc());
		/// 
/// /// Note that {@link Searcher#MaxDoc()} is used instead of /// {@link Lucene.Net.Index.IndexReader#NumDocs()} because it is /// proportional to {@link Searcher#DocFreq(Term)} , i.e., when one is /// inaccurate, so is the other, and in the same direction. /// ///
		/// <param name="term">the term in question</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>an IDFExplanation object that includes both an idf score factor
		/// and an explanation for the term</returns>
		/// <throws>IOException</throws>
		public virtual IDFExplanation IdfExplain(Term term, Searcher searcher)
		{
			if (SupportedMethods.overridesTermIDF)
			{
				float idf = Idf(term, searcher);
				return new AnonymousClassIDFExplanation(idf, this);
			}
			int df = searcher.DocFreq(term);
			int max = searcher.MaxDoc();
			float idf2 = Idf(df, max);
			return new AnonymousClassIDFExplanation1(df, max, idf2, this);
		}
		
		/// Computes a score factor for a phrase.
		///

The default implementation sums the {@link #Idf(Term,Searcher)} factor /// for each term in the phrase. /// ///

		/// <param name="terms">the terms in the phrase</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>idf score factor</returns>
		/// See {@link #idfExplain(Collection, Searcher)}.
		[Obsolete("see IdfExplain(Collection, Searcher)")]
		public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
		{
			float idf = 0.0f;
			System.Collections.IEnumerator i = terms.GetEnumerator();
			while (i.MoveNext())
			{
				idf += Idf((Term) i.Current, searcher);
			}
			return idf;
		}
		
		/// Computes a score factor for a phrase.
		///

/// The default implementation sums the idf factor for /// each term in the phrase. /// ///

		/// <param name="terms">the terms in the phrase</param>
		/// <param name="searcher">the document collection being searched</param>
		/// <returns>an IDFExplanation object that includes both an idf score factor
		/// for the phrase and an explanation for each term</returns>
		/// <throws>IOException</throws>
		public virtual IDFExplanation idfExplain(System.Collections.ICollection terms, Searcher searcher)
		{
			if (SupportedMethods.overridesCollectionIDF)
			{
				float idf = Idf(terms, searcher);
				return new AnonymousClassIDFExplanation2(idf, this);
			}
			int max = searcher.MaxDoc();
			float idf2 = 0.0f;
			System.Text.StringBuilder exp = new System.Text.StringBuilder();
			foreach (Term term in terms)
			{
				int df = searcher.DocFreq(term);
				idf2 += Idf(df, max);
				exp.Append(" ");
				exp.Append(term.Text());
				exp.Append("=");
				exp.Append(df);
			}
			float fIdf = idf2;
			return new AnonymousClassIDFExplanation3(fIdf, exp, this);
		}
		
		/// Computes a score factor based on a term's document frequency (the number
		/// of documents which contain the term).  This value is multiplied by the
		/// {@link #Tf(int)} factor for each term in the query and these products are
		/// then summed to form the initial score for a document.
		///

Terms that occur in fewer documents are better indicators of topic, so /// implementations of this method usually return larger values for rare terms, /// and smaller values for common terms. /// ///
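		///
		/// {@link Lucene.Net.Search.DefaultSimilarity} implements this essentially as
		/// (sketch of the stock behavior, matching the idf formula above):
		///
		/// 	public override float Idf(int docFreq, int numDocs)
		/// 	{
		/// 		return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
		/// 	}
		///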

/// the number of documents which contain the term /// /// the total number of documents in the collection /// /// a score factor based on the term's document frequency /// public abstract float Idf(int docFreq, int numDocs); /// Computes a score factor based on the fraction of all query terms that a /// document contains. This value is multiplied into scores. /// ///

The presence of a large portion of the query terms indicates a better /// match with the query, so implementations of this method usually return /// larger values when the ratio between these parameters is large and smaller /// values when the ratio between them is small. /// ///
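		///
		/// {@link Lucene.Net.Search.DefaultSimilarity} implements this essentially as
		/// (sketch of the stock behavior):
		///
		/// 	public override float Coord(int overlap, int maxOverlap)
		/// 	{
		/// 		// e.g. 3 of 4 query terms matched gives a factor of 0.75
		/// 		return overlap / (float) maxOverlap;
		/// 	}
		///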

/// the number of query terms matched in the document /// /// the total number of terms in the query /// /// a score factor based on term overlap with the query /// public abstract float Coord(int overlap, int maxOverlap); /// Calculate a scoring factor based on the data in the payload. Overriding implementations /// are responsible for interpreting what is in the payload. Lucene makes no assumptions about /// what is in the byte array. ///

/// The default implementation returns 1. /// ///

		/// <param name="fieldName">The fieldName of the term this payload belongs to</param>
		/// <param name="payload">The payload byte array to be scored</param>
		/// <param name="offset">The offset into the payload array</param>
		/// <param name="length">The length in the array</param>
		/// <returns>An implementation dependent float to be used as a scoring factor</returns>
		/// See {@link #ScorePayload(int, String, int, int, byte[], int, int)}.
		//TODO: When removing this, set the default value below to return 1.
		[Obsolete("See ScorePayload(int, String, int, int, byte[], int, int)")]
		public virtual float ScorePayload(System.String fieldName, byte[] payload, int offset, int length)
		{
			//Do nothing
			return 1;
		}
		
		/// Calculate a scoring factor based on the data in the payload.  Overriding implementations
		/// are responsible for interpreting what is in the payload.  Lucene makes no assumptions about
		/// what is in the byte array.
		///

/// The default implementation returns 1. /// ///

		/// <param name="docId">The docId currently being scored.  If this value is {@link #NO_DOC_ID_PROVIDED},
		/// then it should be assumed that the PayloadQuery implementation does not provide document information</param>
		/// <param name="fieldName">The fieldName of the term this payload belongs to</param>
		/// <param name="start">The start position of the payload</param>
		/// <param name="end">The end position of the payload</param>
		/// <param name="payload">The payload byte array to be scored</param>
		/// <param name="offset">The offset into the payload array</param>
		/// <param name="length">The length in the array</param>
		/// <returns>An implementation dependent float to be used as a scoring factor</returns>
		public virtual float ScorePayload(int docId, System.String fieldName, int start, int end, byte[] payload, int offset, int length)
		{
			//TODO: When removing the deprecated scorePayload above, set this to return 1
			return ScorePayload(fieldName, payload, offset, length);
		}
		
		/// Remove this when old API is removed!
		[Obsolete("Remove this when old API is removed! ")]
		private MethodSupport SupportedMethods;
		
		/// Remove this when old API is removed!
		[Obsolete("Remove this when old API is removed! ")]
		[Serializable]
		private sealed class MethodSupport
		{
			internal bool overridesCollectionIDF;
			internal bool overridesTermIDF;
			
			internal MethodSupport(System.Type clazz)
			{
				overridesCollectionIDF = IsMethodOverridden(clazz, "Idf", C_IDF_METHOD_PARAMS);
				overridesTermIDF = IsMethodOverridden(clazz, "Idf", T_IDF_METHOD_PARAMS);
			}
			
			private static bool IsMethodOverridden(System.Type clazz, System.String name, System.Type[] params_Renamed)
			{
				try
				{
					return clazz.GetMethod(name, (params_Renamed == null) ? new System.Type[0] : (System.Type[]) params_Renamed).DeclaringType != typeof(Similarity);
				}
				catch (System.MethodAccessException e)
				{
					// should not happen
					throw new System.SystemException(e.Message, e);
				}
			}
			
			/// Remove this when old API is removed!
			[Obsolete("Remove this when old API is removed! ")]
			private static readonly System.Type[] T_IDF_METHOD_PARAMS = new System.Type[]{typeof(Term), typeof(Searcher)};
			
			/// Remove this when old API is removed!
			[Obsolete("Remove this when old API is removed! ")]
			private static readonly System.Type[] C_IDF_METHOD_PARAMS = new System.Type[]{typeof(System.Collections.ICollection), typeof(Searcher)};
		}
		
		/// Remove this when old API is removed!
		[Obsolete("Remove this when old API is removed! ")]
		private static readonly System.Collections.Hashtable knownMethodSupport = new System.Collections.Hashtable();
		// {{Aroush-2.9 Port issue, need to mimic java's IdentityHashMap
		/*
		 * From Java docs:
		 * This class implements the Map interface with a hash table, using
		 * reference-equality in place of object-equality when comparing keys
		 * (and values).  In other words, in an IdentityHashMap, two keys k1 and k2
		 * are considered equal if and only if (k1==k2).  (In normal Map
		 * implementations (like HashMap) two keys k1 and k2 are considered
		 * equal if and only if (k1==null ? k2==null : k1.equals(k2)).)
		 */
		// Aroush-2.9}}
		
		/// Remove this when old API is removed!
		[Obsolete("Remove this when old API is removed! ")]
		private static MethodSupport GetSupportedMethods(System.Type clazz)
		{
			MethodSupport supportedMethods;
			lock (knownMethodSupport)
			{
				supportedMethods = (MethodSupport) knownMethodSupport[clazz];
				if (supportedMethods == null)
				{
					knownMethodSupport.Add(clazz, supportedMethods = new MethodSupport(clazz));
				}
			}
			return supportedMethods;
		}
		
		/// The Similarity implementation used by default.
		/// TODO: move back to top when old API is removed!
		private static Similarity defaultImpl = new DefaultSimilarity();
		
		static Similarity()
		{
			for (int i = 0; i < 256; i++)
				NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
		}
	}
}