/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using IndexReader = Lucene.Net.Index.IndexReader;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Term = Lucene.Net.Index.Term;
using SmallFloat = Lucene.Net.Util.SmallFloat;
namespace Lucene.Net.Search
{
/// Expert: Scoring API.
/// Subclasses implement search scoring.
///
///
/// The score of query q for document d correlates to the
/// cosine-distance or dot-product between document and query vectors in a
/// Vector Space Model (VSM) of Information Retrieval.
/// A document whose vector is closer to the query vector in that model is scored higher.
///
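/// The score is computed as follows:
///
/// $$ \mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \mathrm{queryNorm}(q) \cdot \sum_{t \in q} \Big( \mathrm{tf}(t \text{ in } d) \cdot \mathrm{idf}(t)^{2} \cdot t.\mathrm{getBoost}() \cdot \mathrm{norm}(t,d) \Big) $$
///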
/// where
///
/// -
///
/// tf(t in d)
/// correlates to the term's frequency,
/// defined as the number of times term t appears in the currently scored document d.
/// Documents that have more occurrences of a given term receive a higher score.
/// The default computation for tf(t in d) in
/// {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) DefaultSimilarity} is:
///
/// $$ \mathrm{tf}(t \text{ in } d) = \mathit{frequency}^{1/2} $$
///
/// (computed by {@link Lucene.Net.Search.DefaultSimilarity#Tf(float) tf(t in d)})
///
/// -
///
/// idf(t) stands for Inverse Document Frequency. This value
/// correlates to the inverse of docFreq
/// (the number of documents in which the term t appears).
/// This means rarer terms give higher contribution to the total score.
/// The default computation for idf(t) in
/// {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) DefaultSimilarity} is:
///
/// $$ \mathrm{idf}(t) = 1 + \log\left( \frac{\mathit{numDocs}}{\mathit{docFreq} + 1} \right) $$
///
/// (computed by {@link Lucene.Net.Search.DefaultSimilarity#Idf(int, int) idf(t)})
///
/// -
///
/// coord(q,d)
/// is a score factor based on how many of the query terms are found in the specified document.
/// Typically, a document that contains more of the query's terms will receive a higher score
/// than another document with fewer query terms.
/// This is a search time factor computed in
/// {@link #Coord(int, int) coord(q,d)}
/// by the Similarity in effect at search time.
///
///
///
/// -
///
/// queryNorm(q)
///
/// is a normalizing factor used to make scores between queries comparable.
/// This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
/// but rather just attempts to make scores from different queries (or even different indexes) comparable.
/// This is a search time factor computed by the Similarity in effect at search time.
///
/// The default computation in
/// {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) DefaultSimilarity}
/// is:
///
/// $$ \mathrm{queryNorm}(q) = \mathrm{queryNorm}(\mathit{sumOfSquaredWeights}) = \frac{1}{\mathit{sumOfSquaredWeights}^{1/2}} $$
///
/// (computed by {@link Lucene.Net.Search.DefaultSimilarity#QueryNorm(float) queryNorm(sumOfSquaredWeights)})
///
/// The sum of squared weights (of the query terms) is
/// computed by the query {@link Lucene.Net.Search.Weight} object.
/// For example, a {@link Lucene.Net.Search.BooleanQuery boolean query}
/// computes this value as:
///
/// $$ \mathit{sumOfSquaredWeights} = q.\mathrm{getBoost}()^{2} \cdot \sum_{t \in q} \big( \mathrm{idf}(t) \cdot t.\mathrm{getBoost}() \big)^{2} $$
///
/// (here sumOfSquaredWeights is {@link Lucene.Net.Search.Weight#SumOfSquaredWeights() sumOfSquaredWeights}
/// and q.getBoost() is {@link Lucene.Net.Search.Query#GetBoost() q.getBoost()})
///
/// -
///
/// t.getBoost()
/// is a search time boost of term t in the query q as
/// specified in the query text
/// (see query syntax),
/// or as set by application calls to
/// {@link Lucene.Net.Search.Query#SetBoost(float) setBoost()}.
/// Notice that there is really no direct API for accessing a boost of one term in a multi term query,
/// but rather multi terms are represented in a query as multi
/// {@link Lucene.Net.Search.TermQuery TermQuery} objects,
/// and so the boost of a term in the query is accessible by calling the sub-query
/// {@link Lucene.Net.Search.Query#GetBoost() getBoost()}.
///
///
///
/// -
///
/// norm(t,d) encapsulates a few (indexing time) boost and length factors:
///
///
/// - Document boost - set by calling
/// {@link Lucene.Net.Documents.Document#SetBoost(float) doc.setBoost()}
/// before adding the document to the index.
///
/// - Field boost - set by calling
/// {@link Lucene.Net.Documents.Fieldable#SetBoost(float) field.setBoost()}
/// before adding the field to a document.
///
/// - {@link #LengthNorm(String, int) lengthNorm(field)} - computed
/// when the document is added to the index in accordance with the number of tokens
/// of this field in the document, so that shorter fields contribute more to the score.
/// LengthNorm is computed by the Similarity class in effect at indexing.
///
///
///
///
/// When a document is added to the index, all the above factors are multiplied.
/// If the document has multiple fields with the same name, all their boosts are multiplied together:
///
/// $$ \mathrm{norm}(t,d) = doc.\mathrm{getBoost}() \cdot \mathrm{lengthNorm}(field) \cdot \prod_{\text{field } f \text{ in } d \text{ named as } t} f.\mathrm{getBoost}() $$
///
/// (here doc.getBoost() is {@link Lucene.Net.Documents.Document#GetBoost() doc.getBoost()},
/// lengthNorm(field) is {@link #LengthNorm(String, int) lengthNorm(field)} and
/// f.getBoost() is {@link Lucene.Net.Documents.Fieldable#GetBoost() f.getBoost}())
///
/// However the resulted norm value is {@link #EncodeNorm(float) encoded} as a single byte
/// before being stored.
/// At search time, the norm byte value is read from the index
/// {@link Lucene.Net.Store.Directory directory} and
/// {@link #DecodeNorm(byte) decoded} back to a float norm value.
/// This encoding/decoding, while reducing index size, comes with the price of
/// precision loss - it is not guaranteed that decode(encode(x)) = x.
/// For instance, decode(encode(0.89)) = 0.75.
/// Also notice that search time is too late to modify this norm part of scoring, e.g. by
/// using a different {@link Similarity} for search.
///
///
///
///
///
///
///
///
///
///
///
[Serializable]
public abstract class Similarity
{
/// <summary>
/// The Similarity implementation used by default. It is initialized to a new
/// <see cref="DefaultSimilarity"/>, overwritten by <see cref="SetDefault(Similarity)"/>
/// and handed out by <see cref="GetDefault()"/>.
/// </summary>
private static Similarity defaultImpl = new DefaultSimilarity();
/// <summary>
/// Sets the default <see cref="Similarity"/> implementation used by indexing and
/// search code.
/// </summary>
/// <param name="similarity">The implementation to install as the shared default. The
/// reference is stored exactly as given; no null check is performed here.</param>
public static void SetDefault(Similarity similarity)
{
    defaultImpl = similarity;
}
/// <summary>
/// Returns the default <see cref="Similarity"/> implementation used by indexing and
/// search code. This is initially an instance of <see cref="DefaultSimilarity"/>, and
/// afterwards whatever was last passed to <see cref="SetDefault(Similarity)"/>.
/// </summary>
/// <returns>The currently stored default implementation.</returns>
public static Similarity GetDefault()
{
    return defaultImpl;
}
/// <summary>
/// Cache of decoded bytes: one float slot for each of the 256 possible byte values.
/// The slots are filled by the static constructor and read by
/// <see cref="DecodeNorm(byte)"/> and <see cref="GetNormDecoder()"/>.
/// </summary>
private static readonly float[] NORM_TABLE = new float[256];
/// <summary>
/// Decodes a normalization factor stored in an index.
/// </summary>
/// <param name="b">The encoded norm byte.</param>
/// <returns>The float cached in the decode table for <paramref name="b"/>.</returns>
public static float DecodeNorm(byte b)
{
    // A C# byte is unsigned (0..255), so it indexes the 256-entry table directly;
    // the sign-stripping mask needed in the Java original is a no-op here.
    return NORM_TABLE[b];
}
/// <summary>
/// Returns a table for decoding normalization bytes.
/// </summary>
/// <returns>The shared 256-entry cache array itself (not a copy).</returns>
public static float[] GetNormDecoder()
{
    float[] decoder = NORM_TABLE;
    return decoder;
}
/// <summary>
/// Computes the normalization value for a field given the total number of
/// terms contained in a field. These values, together with field boosts, are
/// stored in an index and multiplied into scores for hits on each field by the
/// search code.
/// <para>
/// Matches in longer fields are less precise, so implementations of this
/// method usually return smaller values when <c>numTokens</c> is large,
/// and larger values when <c>numTokens</c> is small.
/// </para>
/// <para>
/// Note that these values are computed under
/// <c>IndexWriter.AddDocument(Lucene.Net.Documents.Document)</c> and then stored using
/// <see cref="EncodeNorm(float)"/>. Thus they have limited precision, and documents
/// must be re-indexed if this method is altered.
/// </para>
/// </summary>
/// <param name="fieldName">the name of the field</param>
/// <param name="numTokens">the total number of tokens contained in fields named
/// <c>fieldName</c> of the document</param>
/// <returns>a normalization factor for hits on this field of this document</returns>
public abstract float LengthNorm(System.String fieldName, int numTokens);
/// <summary>
/// Computes the normalization value for a query given the sum of the squared
/// weights of each of the query terms. This value is then multiplied into the
/// weight of each query term.
/// <para>
/// This does not affect ranking, but rather just attempts to make scores
/// from different queries comparable.
/// </para>
/// </summary>
/// <param name="sumOfSquaredWeights">the sum of the squares of query term weights</param>
/// <returns>a normalization factor for query weights</returns>
public abstract float QueryNorm(float sumOfSquaredWeights);
/// <summary>
/// Encodes a normalization factor for storage in an index.
/// <para>
/// The encoding uses a three-bit mantissa, a five-bit exponent, and
/// the zero-exponent point at 15, thus representing values from around
/// 7x10^9 to 2x10^-9 with about one significant decimal digit of accuracy.
/// Zero is also represented. Negative numbers are rounded up to zero.
/// Values too large to represent are rounded down to the largest
/// representable value. Positive values too small to represent are rounded
/// up to the smallest positive representable value.
/// </para>
/// </summary>
/// <param name="f">the normalization factor to encode</param>
/// <returns>the result of <c>SmallFloat.FloatToByte315</c> converted to a byte</returns>
public static byte EncodeNorm(float f)
{
    byte encoded = (byte) SmallFloat.FloatToByte315(f);
    return encoded;
}
/// <summary>
/// Computes a score factor based on a term or phrase's frequency in a
/// document. This value is multiplied by the <see cref="Idf(Term, Searcher)"/>
/// factor for each term in the query and these products are then summed to
/// form the initial score for a document.
/// <para>
/// Terms and phrases repeated in a document indicate the topic of the
/// document, so implementations of this method usually return larger values
/// when <c>freq</c> is large, and smaller values when <c>freq</c> is small.
/// </para>
/// <para>
/// The default implementation calls <see cref="Tf(float)"/>.
/// </para>
/// </summary>
/// <param name="freq">the frequency of a term within a document</param>
/// <returns>a score factor based on a term's within-document frequency</returns>
public virtual float Tf(int freq)
{
    // The explicit cast selects the Tf(float) overload instead of recursing into Tf(int).
    return Tf((float) freq);
}
/// <summary>
/// Computes the amount of a sloppy phrase match, based on an edit distance.
/// This value is summed for each sloppy phrase match in a document to form
/// the frequency that is passed to <see cref="Tf(float)"/>.
/// <para>
/// A phrase match with a small edit distance to a document passage more
/// closely matches the document, so implementations of this method usually
/// return larger values when the edit distance is small and smaller values
/// when it is large.
/// </para>
/// </summary>
/// <param name="distance">the edit distance of this sloppy phrase match</param>
/// <returns>the frequency increment for this match</returns>
public abstract float SloppyFreq(int distance);
/// <summary>
/// Computes a score factor based on a term or phrase's frequency in a
/// document. This value is multiplied by the <see cref="Idf(Term, Searcher)"/>
/// factor for each term in the query and these products are then summed to
/// form the initial score for a document.
/// <para>
/// Terms and phrases repeated in a document indicate the topic of the
/// document, so implementations of this method usually return larger values
/// when <c>freq</c> is large, and smaller values when <c>freq</c> is small.
/// </para>
/// </summary>
/// <param name="freq">the frequency of a term within a document</param>
/// <returns>a score factor based on a term's within-document frequency</returns>
public abstract float Tf(float freq);
/// <summary>
/// Computes a score factor for a simple term.
/// <para>
/// The default implementation is:
/// <c>return Idf(searcher.DocFreq(term), searcher.MaxDoc());</c>
/// </para>
/// <para>
/// Note that <c>Searcher.MaxDoc()</c> is used instead of
/// <c>IndexReader.NumDocs()</c> because it is proportional to
/// <c>Searcher.DocFreq(Term)</c>, i.e., when one is inaccurate,
/// so is the other, and in the same direction.
/// </para>
/// </summary>
/// <param name="term">the term in question</param>
/// <param name="searcher">the document collection being searched</param>
/// <returns>a score factor for the term</returns>
public virtual float Idf(Term term, Searcher searcher)
{
    // Same evaluation order as a direct call: document frequency first, then max doc.
    int docFreq = searcher.DocFreq(term);
    int maxDoc = searcher.MaxDoc();
    return Idf(docFreq, maxDoc);
}
/// <summary>
/// Computes a score factor for a phrase.
/// <para>
/// The default implementation sums the <see cref="Idf(Term, Searcher)"/> factor
/// for each term in the phrase.
/// </para>
/// </summary>
/// <param name="terms">the terms in the phrase; every element must be a <see cref="Term"/></param>
/// <param name="searcher">the document collection being searched</param>
/// <returns>a score factor for the phrase (0 when <paramref name="terms"/> is empty)</returns>
public virtual float Idf(System.Collections.ICollection terms, Searcher searcher)
{
    float idf = 0.0f;
    // foreach applies the same per-element cast to Term as a manual enumerator loop,
    // and additionally disposes the enumerator when it implements IDisposable.
    foreach (Term term in terms)
    {
        idf += Idf(term, searcher);
    }
    return idf;
}
/// <summary>
/// Computes a score factor based on a term's document frequency (the number
/// of documents which contain the term). This value is multiplied by the
/// <see cref="Tf(int)"/> factor for each term in the query and these products are
/// then summed to form the initial score for a document.
/// <para>
/// Terms that occur in fewer documents are better indicators of topic, so
/// implementations of this method usually return larger values for rare terms,
/// and smaller values for common terms.
/// </para>
/// </summary>
/// <param name="docFreq">the number of documents which contain the term</param>
/// <param name="numDocs">the total number of documents in the collection</param>
/// <returns>a score factor based on the term's document frequency</returns>
public abstract float Idf(int docFreq, int numDocs);
/// <summary>
/// Computes a score factor based on the fraction of all query terms that a
/// document contains. This value is multiplied into scores.
/// <para>
/// The presence of a large portion of the query terms indicates a better
/// match with the query, so implementations of this method usually return
/// larger values when the ratio between these parameters is large and smaller
/// values when the ratio between them is small.
/// </para>
/// </summary>
/// <param name="overlap">the number of query terms matched in the document</param>
/// <param name="maxOverlap">the total number of terms in the query</param>
/// <returns>a score factor based on term overlap with the query</returns>
public abstract float Coord(int overlap, int maxOverlap);
// Type initializer: pre-computes the decoded float for every possible norm byte
// (0..255) and stores it in NORM_TABLE, so decoding is a plain array lookup.
static Similarity()
{
    for (int code = 0; code < NORM_TABLE.Length; code++)
    {
        NORM_TABLE[code] = SmallFloat.Byte315ToFloat((byte) code);
    }
}
}
}