/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

parcel Lucy;

/** Judge how well a document matches a query.
 *
 * After determining whether a document matches a given query, a score must be
 * calculated which indicates how well the document matches the query.  The
 * Similarity class is used to judge how "similar" the query and the document
 * are to each other; the closer the resemblance, the higher the document
 * scores.
 *
 * The default implementation uses Lucene's modified cosine similarity
 * measure.  Subclasses might tweak the existing algorithms, or might be used
 * in conjunction with custom Query subclasses to implement arbitrary scoring
 * schemes.
 *
 * Most of the methods operate on single fields, but some are used to combine
 * scores from multiple fields.
 */
public class Lucy::Index::Similarity cnick Sim inherits Clownfish::Obj : dumpable {

    /* NOTE(review): exposed through Get_Norm_Decoder(); presumably the table
     * Decode_Norm() reads from -- confirm against the implementation.
     */
    float *norm_decoder;

    /* NOTE(review): no documentation in the original; by its signature this
     * returns a new Similarity whose reference the caller owns
     * ("incremented").
     */
    inert incremented Similarity* new();

    /** Constructor.  Takes no arguments.
     */
    public inert Similarity* init(Similarity *self);

    /** Factory method for creating a Posting.
     */
    public incremented Posting* Make_Posting(Similarity *self);

    /** Factory method for creating a PostingWriter.
     */
    incremented PostingWriter* Make_Posting_Writer(Similarity *self, Schema *schema, Snapshot *snapshot, Segment *segment, PolyReader *polyreader, int32_t field_num);

    /** Return a score factor based on the frequency of a term in a given
     * document.  The default implementation is sqrt(freq).  Other
     * implementations typically produce ascending scores with ascending
     * freqs, since the more times a doc matches, the more relevant it is
     * likely to be.
     */
    public float TF(Similarity *self, float freq);

    /** Calculate the Inverse Document Frequency for a term in a given
     * collection.
     *
     * @param doc_freq The number of documents that the term appears in.
     * @param total_docs The number of documents in the collection.
     */
    public float IDF(Similarity *self, int64_t doc_freq, int64_t total_docs);

    /** Calculate a score factor based on the number of terms which match.
     */
    public float Coord(Similarity *self, uint32_t overlap, uint32_t max_overlap);

    /** Dampen the scores of long documents.
     *
     * After a field is broken up into terms at index-time, each term must be
     * assigned a weight.  One of the factors in calculating this weight is
     * the number of tokens that the original field was broken into.
     *
     * Typically, we assume that the more tokens in a field, the less
     * important any one of them is -- so that, e.g. 5 mentions of "Kafka" in
     * a short article are given more heft than 5 mentions of "Kafka" in an
     * entire book.  The default implementation of Length_Norm expresses this
     * using an inverted square root.
     *
     * However, the inverted square root has a tendency to reward very short
     * fields highly, which isn't always appropriate for fields you expect to
     * have a lot of tokens on average.
     */
    public float Length_Norm(Similarity *self, uint32_t num_tokens);

    /** Normalize a Query's weight so that it is comparable to other Queries.
     */
    public float Query_Norm(Similarity *self, float sum_of_squared_weights);

    /** Encode_Norm and Decode_Norm encode and decode between 32-bit IEEE
     * floating point numbers and a 5-bit exponent, 3-bit mantissa float.  The
     * range covered by the single-byte encoding is 7x10^9 to 2x10^-9.  The
     * accuracy is about one significant decimal digit.
     */
    uint32_t Encode_Norm(Similarity *self, float f);

    /** See Encode_Norm.
     */
    float Decode_Norm(Similarity *self, uint32_t input);

    /* NOTE(review): undocumented in the original; by name an accessor for the
     * norm_decoder member -- confirm whether callers may retain the pointer.
     */
    float* Get_Norm_Decoder(Similarity *self);

    /* NOTE(review): the remaining methods carry no documentation in the
     * original.  Dump/Load presumably back the class's `dumpable` attribute
     * and Serialize/Deserialize presumably write to and read from the given
     * streams -- confirm against the implementation before relying on this.
     */
    public void Destroy(Similarity *self);

    public incremented Obj* Dump(Similarity *self);

    public incremented Similarity* Load(Similarity *self, Obj *dump);

    public bool_t Equals(Similarity *self, Obj *other);

    public void Serialize(Similarity *self, OutStream *outstream);

    public incremented Similarity* Deserialize(decremented Similarity *self, InStream *instream);
}