/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using IndexReader = Lucene.Net.Index.IndexReader; using IndexWriter = Lucene.Net.Index.IndexWriter; using Term = Lucene.Net.Index.Term; using SmallFloat = Lucene.Net.Util.SmallFloat; namespace Lucene.Net.Search { ///

/// <summary>
/// Expert: Scoring API.
/// <para>Subclasses implement search scoring.</para>
/// </summary>
/// <remarks>
/// <para>
/// The score of query <c>q</c> for document <c>d</c> correlates to the
/// cosine-distance or dot-product between document and query vectors in a
/// Vector Space Model (VSM) of Information Retrieval.
/// A document whose vector is closer to the query vector in that model is scored higher.
/// </para>
/// <para>The score is computed as follows:</para>
/// <code>
/// score(q,d) = coord(q,d) · queryNorm(q) · ∑_{t in q} ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
/// </code>
/// <para>where</para>
/// <list type="number">
/// <item><description>
/// <b>tf(t in d)</b> correlates to the term's <i>frequency</i>,
/// defined as the number of times term <c>t</c> appears in the currently scored document <c>d</c>.
/// Documents that have more occurrences of a given term receive a higher score.
/// The default computation for <c>tf(t in d)</c> in
/// <see cref="Lucene.Net.Search.DefaultSimilarity.Tf(float)"/> is:
/// <code>
/// tf(t in d) = frequency^½
/// </code>
/// </description></item>
/// <item><description>
/// <b>idf(t)</b> stands for Inverse Document Frequency. This value
/// correlates to the inverse of <i>docFreq</i>
/// (the number of documents in which the term <c>t</c> appears).
/// This means rarer terms give higher contribution to the total score.
/// The default computation for <c>idf(t)</c> in
/// <see cref="Lucene.Net.Search.DefaultSimilarity.Idf(int, int)"/> is:
/// <code>
/// idf(t) = 1 + log( numDocs / (docFreq + 1) )
/// </code>
/// </description></item>
/// <item><description>
/// <b>coord(q,d)</b>
/// is a score factor based on how many of the query terms are found in the specified document.
/// Typically, a document that contains more of the query's terms will receive a higher score
/// than another document with fewer query terms.
/// This is a search time factor computed in
/// <see cref="Coord(int, int)"/>
/// by the Similarity in effect at search time.
/// </description></item>
/// <item><description>
/// <b>queryNorm(q)</b>
/// is a normalizing factor used to make scores between queries comparable.
/// This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
/// but rather just attempts to make scores from different queries (or even different indexes) comparable.
/// This is a search time factor computed by the Similarity in effect at search time.
/// The default computation in
/// <see cref="Lucene.Net.Search.DefaultSimilarity.QueryNorm(float)"/>
/// is:
/// <code>
/// queryNorm(q) = queryNorm(sumOfSquaredWeights) = 1 / sumOfSquaredWeights^½
/// </code>
/// The sum of squared weights (of the query terms) is
/// computed by the query <see cref="Lucene.Net.Search.Weight"/> object.
/// For example, a <see cref="Lucene.Net.Search.BooleanQuery"/> (boolean query)
/// computes this value
/// (<see cref="Lucene.Net.Search.Weight.SumOfSquaredWeights()"/>, with <c>q.getBoost()</c> being
/// <see cref="Lucene.Net.Search.Query.GetBoost()"/>) as:
/// <code>
/// sumOfSquaredWeights = q.getBoost()² · ∑_{t in q} ( idf(t) · t.getBoost() )²
/// </code>
/// </description></item>
/// <item><description>
/// <b>t.getBoost()</b>
/// is a search time boost of term <c>t</c> in the query <c>q</c> as
/// specified in the query text
/// (see query syntax),
/// or as set by application calls to
/// <see cref="Lucene.Net.Search.Query.SetBoost(float)"/>.
/// Notice that there is really no direct API for accessing a boost of one term in a multi term query,
/// but rather multi terms are represented in a query as multi
/// <see cref="Lucene.Net.Search.TermQuery"/> objects,
/// and so the boost of a term in the query is accessible by calling the sub-query
/// <see cref="Lucene.Net.Search.Query.GetBoost()"/>.
/// </description></item>
/// <item><description>
/// <b>norm(t,d)</b> encapsulates a few (indexing time) boost and length factors:
/// <list type="bullet">
/// <item><description><b>Document boost</b> - set by calling
/// <see cref="Lucene.Net.Documents.Document.SetBoost(float)"/>
/// before adding the document to the index.</description></item>
/// <item><description><b>Field boost</b> - set by calling
/// <see cref="Lucene.Net.Documents.Fieldable.SetBoost(float)"/>
/// before adding the field to a document.</description></item>
/// <item><description><see cref="LengthNorm(String, int)"/> (<c>lengthNorm(field)</c>) - computed
/// when the document is added to the index in accordance with the number of tokens
/// of this field in the document, so that shorter fields contribute more to the score.
/// LengthNorm is computed by the Similarity class in effect at indexing.</description></item>
/// </list>
/// When a document is added to the index, all the above factors are multiplied.
/// If the document has multiple fields with the same name, all their boosts are multiplied together
/// (<c>doc.getBoost()</c> is <see cref="Lucene.Net.Documents.Document.GetBoost()"/> and
/// <c>f.getBoost()</c> is <see cref="Lucene.Net.Documents.Fieldable.GetBoost()"/>):
/// <code>
/// norm(t,d) = doc.getBoost() · lengthNorm(field) · ∏_{field f in d named as t} f.getBoost()
/// </code>
/// However the resulting norm value is encoded (<see cref="EncodeNorm(float)"/>) as a single byte
/// before being stored.
/// At search time, the norm byte value is read from the index
/// <see cref="Lucene.Net.Store.Directory"/> and
/// decoded (<see cref="DecodeNorm(byte)"/>) back to a float norm value.
/// This encoding/decoding, while reducing index size, comes with the price of
/// precision loss - it is not guaranteed that decode(encode(x)) = x.
/// For instance, decode(encode(0.89)) = 0.75.
/// Also notice that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
/// using a different <see cref="Similarity"/> for search.
/// </description></item>
/// </list>
/// </remarks>
/// <seealso cref="SetDefault(Similarity)"/>
[Serializable]
public abstract class Similarity
{
    /// <summary>The Similarity implementation used by default.</summary>
    private static Similarity defaultImpl = new DefaultSimilarity();

    /// <summary>
    /// Set the default Similarity implementation used by indexing and search
    /// code.
    /// </summary>
    /// <param name="similarity">the implementation that <see cref="GetDefault()"/> returns from now on</param>
    public static void SetDefault(Similarity similarity)
    {
        defaultImpl = similarity;
    }

    /// <summary>
    /// Return the default Similarity implementation used by indexing and search
    /// code.
    /// <para>This is initially an instance of <see cref="DefaultSimilarity"/>.</para>
    /// </summary>
    /// <returns>the implementation most recently passed to <see cref="SetDefault(Similarity)"/>,
    /// or the initial <see cref="DefaultSimilarity"/> instance</returns>
    public static Similarity GetDefault()
    {
        return defaultImpl;
    }

    /// <summary>Cache of decoded bytes.</summary>
    private static readonly float[] NORM_TABLE = new float[256];

    /// <summary>Decodes a normalization factor stored in an index.</summary>
    /// <param name="b">the encoded norm byte</param>
    /// <returns>the cached decoded value for <paramref name="b"/></returns>
    /// <seealso cref="EncodeNorm(float)"/>
    public static float DecodeNorm(byte b)
    {
        // A C# byte is unsigned (0..255), so it indexes the 256-entry table directly;
        // no "& 0xFF" mask is needed.
        return NORM_TABLE[b];
    }

    /// <summary>Returns a table for decoding normalization bytes.</summary>
    /// <returns>the internal 256-entry decode table itself (not a copy)</returns>
    /// <seealso cref="DecodeNorm(byte)"/>
    public static float[] GetNormDecoder()
    {
        return NORM_TABLE;
    }

    /// <summary>
    /// Computes the normalization value for a field given the total number of
    /// terms contained in a field. These values, together with field boosts, are
    /// stored in an index and multiplied into scores for hits on each field by the
    /// search code.
    /// <para>
    /// Matches in longer fields are less precise, so implementations of this
    /// method usually return smaller values when <c>numTokens</c> is large,
    /// and larger values when <c>numTokens</c> is small.
    /// </para>
    /// <para>
    /// Note that these values are computed under
    /// <see cref="IndexWriter.AddDocument(Lucene.Net.Documents.Document)"/> and then stored using
    /// <see cref="EncodeNorm(float)"/>. Thus they have limited precision, and documents
    /// must be re-indexed if this method is altered.
    /// </para>
    /// </summary>
    /// <param name="fieldName">the name of the field</param>
    /// <param name="numTokens">the total number of tokens contained in fields named
    /// <i>fieldName</i> of <i>doc</i>.</param>
    /// <returns>a normalization factor for hits on this field of this document</returns>
    public abstract float LengthNorm(System.String fieldName, int numTokens);

    /// <summary>
    /// Computes the normalization value for a query given the sum of the squared
    /// weights of each of the query terms. This value is then multiplied into the
    /// weight of each query term.
    /// <para>
    /// This does not affect ranking, but rather just attempts to make scores
    /// from different queries comparable.
    /// </para>
    /// </summary>
    /// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
    /// <returns>a normalization factor for query weights</returns>
    public abstract float QueryNorm(float sumOfSquaredWeights);

    /// <summary>
    /// Encodes a normalization factor for storage in an index.
    /// <para>
    /// The encoding uses a three-bit mantissa, a five-bit exponent, and
    /// the zero-exponent point at 15, thus
    /// representing values from around 7x10^9 to 2x10^-9 with about one
    /// significant decimal digit of accuracy. Zero is also represented.
    /// Negative numbers are rounded up to zero. Values too large to represent
    /// are rounded down to the largest representable value. Positive values too
    /// small to represent are rounded up to the smallest positive representable
    /// value.
    /// </para>
    /// </summary>
    /// <param name="f">the normalization factor to encode</param>
    /// <returns>the result of <c>SmallFloat.FloatToByte315(f)</c>, cast to a byte</returns>
    /// <seealso cref="DecodeNorm(byte)"/>
    public static byte EncodeNorm(float f)
    {
        return (byte) SmallFloat.FloatToByte315(f);
    }

    /// <summary>
    /// Computes a score factor based on a term or phrase's frequency in a
    /// document. This value is multiplied by the <see cref="Idf(Term, Searcher)"/>
    /// factor for each term in the query and these products are then summed to
    /// form the initial score for a document.
    /// <para>
    /// Terms and phrases repeated in a document indicate the topic of the
    /// document, so implementations of this method usually return larger values
    /// when <c>freq</c> is large, and smaller values when <c>freq</c>
    /// is small.
    /// </para>
    /// <para>The default implementation calls <see cref="Tf(float)"/>.</para>
    /// </summary>
    /// <param name="freq">the frequency of a term within a document</param>
    /// <returns>a score factor based on a term's within-document frequency</returns>
    public virtual float Tf(int freq)
    {
        return Tf((float) freq);
    }

    /// <summary>
    /// Computes the amount of a sloppy phrase match, based on an edit distance.
    /// This value is summed for each sloppy phrase match in a document to form
    /// the frequency that is passed to <see cref="Tf(float)"/>.
    /// <para>
    /// A phrase match with a small edit distance to a document passage more
    /// closely matches the document, so implementations of this method usually
    /// return larger values when the edit distance is small and smaller values
    /// when it is large.
    /// </para>
    /// </summary>
    /// <param name="distance">the edit distance of this sloppy phrase match</param>
    /// <returns>the frequency increment for this match</returns>
    public abstract float SloppyFreq(int distance);

    /// <summary>
    /// Computes a score factor based on a term or phrase's frequency in a
    /// document. <see cref="Tf(int)"/> delegates to this overload by default, and the
    /// summed <see cref="SloppyFreq(int)"/> values of sloppy phrase matches are passed here.
    /// </summary>
    /// <param name="freq">the frequency of a term within a document</param>
    /// <returns>a score factor based on a term's within-document frequency</returns>
    public abstract float Tf(float freq);

    /// <summary>
    /// Computes a score factor for a simple term.
    /// <para>The default implementation is:</para>
    /// <code>
    /// return Idf(searcher.DocFreq(term), searcher.MaxDoc());
    /// </code>
    /// <para>
    /// Note that <see cref="Searcher.MaxDoc()"/> is used instead of
    /// <see cref="IndexReader.NumDocs()"/> because it is proportional to
    /// <see cref="Searcher.DocFreq(Term)"/>, i.e., when one is inaccurate,
    /// so is the other, and in the same direction.
    /// </para>
    /// </summary>
    /// <param name="term">the term in question</param>
    /// <param name="searcher">the document collection being searched</param>
    /// <returns>a score factor for the term</returns>
    public virtual float Idf(Term term, Searcher searcher)
    {
        return Idf(searcher.DocFreq(term), searcher.MaxDoc());
    }

    /// <summary>
    /// Computes a score factor for a phrase.
    /// <para>
    /// The default implementation sums the <see cref="Idf(Term, Searcher)"/> factor
    /// for each term in the phrase.
    /// </para>
    /// </summary>
    /// <param name="terms">the terms in the phrase; every element must be a <see cref="Term"/></param>
    /// <param name="searcher">the document collection being searched</param>
    /// <returns>a score factor for the phrase (0 for an empty collection)</returns>
    public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
    {
        float total = 0.0f;
        // foreach performs the same per-element cast to Term as the manual enumerator
        // loop did, and additionally disposes the enumerator when it is disposable.
        foreach (Term term in terms)
        {
            total += Idf(term, searcher);
        }
        return total;
    }

    /// <summary>
    /// Computes a score factor based on a term's document frequency (the number
    /// of documents which contain the term). This value is multiplied by the
    /// <see cref="Tf(int)"/> factor for each term in the query and these products are
    /// then summed to form the initial score for a document.
    /// <para>
    /// Terms that occur in fewer documents are better indicators of topic, so
    /// implementations of this method usually return larger values for rare terms,
    /// and smaller values for common terms.
    /// </para>
    /// </summary>
    /// <param name="docFreq">the number of documents which contain the term</param>
    /// <param name="numDocs">the total number of documents in the collection</param>
    /// <returns>a score factor based on the term's document frequency</returns>
    public abstract float Idf(int docFreq, int numDocs);

    /// <summary>
    /// Computes a score factor based on the fraction of all query terms that a
    /// document contains. This value is multiplied into scores.
    /// <para>
    /// The presence of a large portion of the query terms indicates a better
    /// match with the query, so implementations of this method usually return
    /// larger values when the ratio between these parameters is large and smaller
    /// values when the ratio between them is small.
    /// </para>
    /// </summary>
    /// <param name="overlap">the number of query terms matched in the document</param>
    /// <param name="maxOverlap">the total number of terms in the query</param>
    /// <returns>a score factor based on term overlap with the query</returns>
    public abstract float Coord(int overlap, int maxOverlap);

    /// <summary>
    /// Fills the decode cache with the <c>SmallFloat.Byte315ToFloat</c> value of every
    /// possible byte, so that <see cref="DecodeNorm(byte)"/> is a plain table lookup.
    /// </summary>
    static Similarity()
    {
        for (int i = 0; i < 256; i++)
        {
            NORM_TABLE[i] = SmallFloat.Byte315ToFloat((byte) i);
        }
    }
}
} // namespace Lucene.Net.Search

// norm(t,d) = doc.getBoost() · lengthNorm(field) · ∏_{field f in d named as t} f.getBoost()
// where doc.getBoost() is Lucene.Net.Documents.Document.GetBoost(), lengthNorm(field) is
// Similarity.LengthNorm(String, int), and f.getBoost() is Lucene.Net.Documents.Fieldable.GetBoost().