/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;

namespace Lucene.Net.Search
{
    /// Expert: Scoring API.
    ///
    /// Subclasses implement search scoring.
    ///
    /// The score of query q for document d correlates to the
    /// cosine-distance or dot-product between document and query vectors in a
    /// Vector Space Model (VSM) of Information Retrieval.
    /// A document whose vector is closer to the query vector in that model is scored higher.
    ///
    /// The score is computed as follows:
    ///
    ///     score(q,d)  =  coord(q,d) * queryNorm(q) * SUM over t in q of:
    ///                    ( tf(t in d) * idf(t)^2 * t.getBoost() * norm(t,d) )
    ///
    /// where
    ///
    /// 1. tf(t in d) correlates to the term's frequency,
    ///    defined as the number of times term t appears in the currently scored document d.
    ///    Documents that have more occurrences of a given term receive a higher score.
    ///    The default computation for tf(t in d) in
    ///    {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) DefaultSimilarity} is:
    ///
    ///        tf(t in d)  =  frequency^(1/2)
    ///
    /// 2. idf(t) stands for Inverse Document Frequency. This value
    ///    correlates to the inverse of docFreq
    ///    (the number of documents in which the term t appears).
    ///    This means rarer terms give a higher contribution to the total score.
    ///    The default computation for idf(t) in
    ///    {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) DefaultSimilarity} is:
    ///
    ///        idf(t)  =  1 + log ( numDocs / (docFreq + 1) )
    ///
  4. /// ///
  5. /// /// coord(q,d) /// is a score factor based on how many of the query terms are found in the specified document. /// Typically, a document that contains more of the query's terms will receive a higher score /// than another document with fewer query terms. /// This is a search time factor computed in /// {@link #Coord(int, int) coord(q,d)} /// by the Similarity in effect at search time. ///
     
    ///
  6. /// ///
    /// 4. queryNorm(q) is a normalizing factor used to make scores between queries
    ///    comparable. This factor does not affect document ranking (since all ranked
    ///    documents are multiplied by the same factor), but rather just attempts to
    ///    make scores from different queries (or even different indexes) comparable.
    ///    This is a search time factor computed by the Similarity in effect at search time.
    ///    The default computation in
    ///    {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) DefaultSimilarity} is:
    ///
    ///        queryNorm(q)  =  queryNorm(sumOfSquaredWeights)
    ///                      =  1 / sumOfSquaredWeights^(1/2)
    ///
    ///    The sum of squared weights (of the query terms) is computed by the query
    ///    {@link Lucene.Net.Search.Weight} object. For example, a
    ///    {@link Lucene.Net.Search.BooleanQuery boolean query} computes this value as:
    ///
    ///        sumOfSquaredWeights  =  q.getBoost()^2 * SUM over t in q of:
    ///                                ( idf(t) * t.getBoost() )^2
    ///
    /// 5. t.getBoost() is a search time boost of term t in the query q, as specified
    ///    in the query text (see query syntax), or as set by application calls to
    ///    {@link Lucene.Net.Search.Query#SetBoost(float) setBoost()}.
    ///    Notice that there is really no direct API for accessing the boost of one
    ///    term in a multi-term query; rather, multiple terms are represented in a
    ///    query as multiple {@link Lucene.Net.Search.TermQuery TermQuery} objects,
    ///    and so the boost of a term in the query is accessible by calling the
    ///    sub-query's {@link Lucene.Net.Search.Query#GetBoost() getBoost()}.
    ///
    /// 6. norm(t,d) encapsulates a few (indexing time) boost and length factors:
    ///
    ///    - Document boost - set by calling
    ///      {@link Lucene.Net.Documents.Document#SetBoost(float) doc.setBoost()}
    ///      before adding the document to the index.
    ///    - Field boost - set by calling
    ///      {@link Lucene.Net.Documents.Fieldable#SetBoost(float) field.setBoost()}
    ///      before adding the field to a document.
    ///    - {@link #LengthNorm(String, int) lengthNorm(field)} - computed when the
    ///      document is added to the index, in accordance with the number of tokens
    ///      of this field in the document, so that shorter fields contribute more to
    ///      the score. LengthNorm is computed by the Similarity class in effect at
    ///      indexing.
    ///
    ///    When a document is added to the index, all the above factors are multiplied.
    ///    If the document has multiple fields with the same name, all their boosts
    ///    are multiplied together:
    ///
    ///        norm(t,d)  =  doc.getBoost() * lengthNorm(field)
    ///                      * PRODUCT over each field f in d named as t of: f.getBoost()
    ///
    ///    However, the resulting norm value is {@link #EncodeNorm(float) encoded} as
    ///    a single byte before being stored. At search time, the norm byte value is
    ///    read from the index {@link Lucene.Net.Store.Directory directory} and
    ///    {@link #DecodeNorm(byte) decoded} back to a float norm value. This
    ///    encoding/decoding reduces index size, but comes at the price of precision
    ///    loss - it is not guaranteed that decode(encode(x)) = x. For instance,
    ///    decode(encode(0.89)) = 0.75. Also notice that search time is too late to
    ///    modify this norm part of scoring, e.g. by using a different
    ///    {@link Similarity} for search.
    ///
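    /// As a minimal, illustrative sketch (the subclass name and its behavior are
    /// hypothetical, not part of the API), an application could flatten the length
    /// normalization factor and install the result as the default Similarity:
    ///
    ///     public class FlatLengthSimilarity : DefaultSimilarity
    ///     {
    ///         // Ignore field length entirely; every field gets the same norm.
    ///         public override float LengthNorm(System.String fieldName, int numTokens)
    ///         {
    ///             return 1.0f;
    ///         }
    ///     }
    ///
    ///     // Must be set before indexing, since norms are baked in at index time.
    ///     Similarity.SetDefault(new FlatLengthSimilarity());
    ///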
    [Serializable]
    public abstract class Similarity
    {
        /// The Similarity implementation used by default.
        private static Similarity defaultImpl = new DefaultSimilarity();

        /// Set the default Similarity implementation used by indexing and search code.
        public static void SetDefault(Similarity similarity)
        {
            Similarity.defaultImpl = similarity;
        }

        /// Return the default Similarity implementation used by indexing and search code.
        ///
        /// This is initially an instance of {@link DefaultSimilarity}.
        public static Similarity GetDefault()
        {
            return Similarity.defaultImpl;
        }

        /// Cache of decoded bytes.
        private static readonly float[] NORM_TABLE = new float[256];

        /// Decodes a normalization factor stored in an index.
        public static float DecodeNorm(byte b)
        {
            return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
        }

        /// Returns a table for decoding normalization bytes.
        public static float[] GetNormDecoder()
        {
            return NORM_TABLE;
        }

        /// Computes the normalization value for a field given the total number of
        /// terms contained in a field. These values, together with field boosts, are
        /// stored in an index and multiplied into scores for hits on each field by
        /// the search code.
        ///
        /// Matches in longer fields are less precise, so implementations of this
        /// method usually return smaller values when numTokens is large,
        /// and larger values when numTokens is small.
        ///
        /// Note that these values are computed under
        /// {@link IndexWriter#AddDocument(Lucene.Net.Documents.Document)} and then
        /// stored using {@link #EncodeNorm(float)}. Thus they have limited precision,
        /// and documents must be re-indexed if this method is altered.
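        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially the inverse square root of the field length:
        ///
        ///     public override float LengthNorm(System.String fieldName, int numTokens)
        ///     {
        ///         // Shorter fields get norms closer to 1; longer fields shrink toward 0.
        ///         return (float) (1.0 / System.Math.Sqrt(numTokens));
        ///     }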

        /// fieldName: the name of the field
        /// numTokens: the total number of tokens contained in fields named
        /// fieldName of doc.
        /// Returns a normalization factor for hits on this field of this document.
        public abstract float LengthNorm(System.String fieldName, int numTokens);

        /// Computes the normalization value for a query given the sum of the squared
        /// weights of each of the query terms. This value is then multiplied into
        /// the weight of each query term.
        ///
        /// This does not affect ranking, but rather just attempts to make scores
        /// from different queries comparable.
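        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially the inverse square root named in the class documentation:
        ///
        ///     public override float QueryNorm(float sumOfSquaredWeights)
        ///     {
        ///         return (float) (1.0 / System.Math.Sqrt(sumOfSquaredWeights));
        ///     }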

        /// sumOfSquaredWeights: the sum of the squares of query term weights
        /// Returns a normalization factor for query weights.
        public abstract float QueryNorm(float sumOfSquaredWeights);

        /// Encodes a normalization factor for storage in an index.
        ///
        /// The encoding uses a three-bit mantissa, a five-bit exponent, and the
        /// zero-exponent point at 15, thus representing values from around 7x10^9
        /// down to 2x10^-9 with about one significant decimal digit of accuracy.
        /// Zero is also represented. Negative numbers are rounded up to zero. Values
        /// too large to represent are rounded down to the largest representable
        /// value. Positive values too small to represent are rounded up to the
        /// smallest positive representable value.
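        ///
        /// For example, the round trip is lossy, as noted in the class documentation:
        ///
        ///     byte b = Similarity.EncodeNorm(0.89f);
        ///     float back = Similarity.DecodeNorm(b);  // yields 0.75f, not 0.89f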

        public static byte EncodeNorm(float f)
        {
            return (byte) SmallFloat.FloatToByte315(f);
        }

        /// Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///

        /// Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when freq is large, and smaller values when freq is small.
        ///

        /// The default implementation calls {@link #Tf(float)}.

        /// freq: the frequency of a term within a document
        /// Returns a score factor based on a term's within-document frequency.
        public virtual float Tf(int freq)
        {
            return Tf((float) freq);
        }

        /// Computes the amount of a sloppy phrase match, based on an edit distance.
        /// This value is summed for each sloppy phrase match in a document to form
        /// the frequency that is passed to {@link #Tf(float)}.
        ///

        /// A phrase match with a small edit distance to a document passage more
        /// closely matches the document, so implementations of this method usually
        /// return larger values when the edit distance is small and smaller values
        /// when it is large.
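        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially:
        ///
        ///     public override float SloppyFreq(int distance)
        ///     {
        ///         // An exact match (distance 0) counts as 1.0; looser matches count less.
        ///         return 1.0f / (distance + 1);
        ///     }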

        /// distance: the edit distance of this sloppy phrase match
        /// Returns the frequency increment for this match.
        public abstract float SloppyFreq(int distance);

        /// Computes a score factor based on a term or phrase's frequency in a
        /// document. This value is multiplied by the {@link #Idf(Term, Searcher)}
        /// factor for each term in the query and these products are then summed to
        /// form the initial score for a document.
        ///

        /// Terms and phrases repeated in a document indicate the topic of the
        /// document, so implementations of this method usually return larger values
        /// when freq is large, and smaller values when freq is small.
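        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially the square root named in the class documentation:
        ///
        ///     public override float Tf(float freq)
        ///     {
        ///         return (float) System.Math.Sqrt(freq);
        ///     }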

        /// freq: the frequency of a term within a document
        /// Returns a score factor based on a term's within-document frequency.
        public abstract float Tf(float freq);

        /// Computes a score factor for a simple term.
        ///

        /// The default implementation is:
        ///
        ///     return Idf(searcher.DocFreq(term), searcher.MaxDoc());
        ///
        /// Note that {@link Searcher#MaxDoc()} is used instead of
        /// {@link IndexReader#NumDocs()} because it is proportional to
        /// {@link Searcher#DocFreq(Term)}, i.e., when one is inaccurate,
        /// so is the other, and in the same direction.
        ///
        /// term: the term in question
        /// searcher: the document collection being searched
        /// Returns a score factor for the term.
        public virtual float Idf(Term term, Searcher searcher)
        {
            return Idf(searcher.DocFreq(term), searcher.MaxDoc());
        }

        /// Computes a score factor for a phrase.
        ///

        /// The default implementation sums the {@link #Idf(Term,Searcher)} factor
        /// for each term in the phrase.

        /// terms: the terms in the phrase
        /// searcher: the document collection being searched
        /// Returns a score factor for the phrase.
        public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
        {
            float idf = 0.0f;
            System.Collections.IEnumerator i = terms.GetEnumerator();
            while (i.MoveNext())
            {
                idf += Idf((Term) i.Current, searcher);
            }
            return idf;
        }

        /// Computes a score factor based on a term's document frequency (the number
        /// of documents which contain the term). This value is multiplied by the
        /// {@link #Tf(int)} factor for each term in the query and these products are
        /// then summed to form the initial score for a document.
        ///

        /// Terms that occur in fewer documents are better indicators of topic, so
        /// implementations of this method usually return larger values for rare
        /// terms, and smaller values for common terms.
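        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially the formula from the class documentation:
        ///
        ///     public override float Idf(int docFreq, int numDocs)
        ///     {
        ///         return (float) (System.Math.Log(numDocs / (double) (docFreq + 1)) + 1.0);
        ///     }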

        /// docFreq: the number of documents which contain the term
        /// numDocs: the total number of documents in the collection
        /// Returns a score factor based on the term's document frequency.
        public abstract float Idf(int docFreq, int numDocs);

        /// Computes a score factor based on the fraction of all query terms that a
        /// document contains. This value is multiplied into scores.
        ///

        /// The presence of a large portion of the query terms indicates a better
        /// match with the query, so implementations of this method usually return
        /// larger values when the ratio between these parameters is large and
        /// smaller values when the ratio between them is small.
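        ///
        /// For reference, the default implementation in {@link DefaultSimilarity}
        /// is essentially the simple ratio:
        ///
        ///     public override float Coord(int overlap, int maxOverlap)
        ///     {
        ///         return overlap / (float) maxOverlap;
        ///     }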

        /// overlap: the number of query terms matched in the document
        /// maxOverlap: the total number of terms in the query
        /// Returns a score factor based on term overlap with the query.
        public abstract float Coord(int overlap, int maxOverlap);

        static Similarity()
        {
            // Precompute the 256-entry decode table used by DecodeNorm().
            for (int i = 0; i < 256; i++)
                NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
        }
    }
}