/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; using Lucene.Net.Search; using Lucene.Net.Index; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Support; using Lucene.Net.Util; namespace Lucene.Net.Search { /// /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms. /// In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration /// of fuzzy scoring factors. /// This generally produces good results for queries where users may provide details in a number of /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and /// a fast query. /// /// For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized /// TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query /// term) and this is factored into the variant's boost. If the source query term does not exist in the /// index the average IDF of the variants is used. /// public class FuzzyLikeThisQuery : Query { static Similarity sim = new DefaultSimilarity(); Query rewrittenQuery = null; EquatableList fieldVals = new EquatableList(); Analyzer analyzer; ScoreTermQueue q; int MAX_VARIANTS_PER_TERM = 50; bool ignoreTF = false; private int maxNumTerms; public override int GetHashCode() { int prime = 31; int result = 1; result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode()); result = prime * result + ((fieldVals == null) ? 0 : fieldVals.GetHashCode()); result = prime * result + (ignoreTF ? 1231 : 1237); result = prime * result + maxNumTerms; return result; } public override bool Equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (GetType() != obj.GetType()) return false; FuzzyLikeThisQuery other = (FuzzyLikeThisQuery)obj; if (analyzer == null) { if (other.analyzer != null) return false; } else if (!analyzer.Equals(other.analyzer)) return false; if (fieldVals == null) { if (other.fieldVals != null) return false; } else if (!fieldVals.Equals(other.fieldVals)) return false; if (ignoreTF != other.ignoreTF) return false; if (maxNumTerms != other.maxNumTerms) return false; return true; } /* * * The total number of terms clauses that will appear once rewritten as a BooleanQuery * */ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer) { q = new ScoreTermQueue(maxNumTerms); this.analyzer = analyzer; this.maxNumTerms = maxNumTerms; } class FieldVals { internal String queryString; internal String fieldName; internal float minSimilarity; internal int prefixLength; public FieldVals(String name, float similarity, int length, String queryString) { fieldName = name; minSimilarity = similarity; prefixLength = length; this.queryString = queryString; } public override int GetHashCode() { int prime = 31; int result = 1; result = prime * result + ((fieldName == null) ? 0 : fieldName.GetHashCode()); result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity),0); result = prime * result + prefixLength; result = prime * result + ((queryString == null) ? 0 : queryString.GetHashCode()); return result; } public override bool Equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (GetType() != obj.GetType()) return false; FieldVals other = (FieldVals)obj; if (fieldName == null) { if (other.fieldName != null) return false; } else if (!fieldName.Equals(other.fieldName)) return false; if (BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minSimilarity), 0)) //if (Float.floatToIntBits(minSimilarity) != Float.floatToIntBits(other.minSimilarity)) return false; if (prefixLength != other.prefixLength) return false; if (queryString == null) { if (other.queryString != null) return false; } else if (!queryString.Equals(other.queryString)) return false; return true; } } /* * Adds user input for "fuzzification" * The string which will be parsed by the analyzer and for which fuzzy variants will be parsed * * The minimum similarity of the term variants (see FuzzyTermEnum) * Length of required common prefix on variant terms (see FuzzyTermEnum) */ public void AddTerms(String queryString, String fieldName, float minSimilarity, int prefixLength) { fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString)); } private void AddTerms(IndexReader reader, FieldVals f) { if (f.queryString == null) return; TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString)); ITermAttribute termAtt = ts.AddAttribute(); int corpusNumDocs = reader.NumDocs(); Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects HashSet processedTerms = new HashSet(); while (ts.IncrementToken()) { String term = termAtt.Term; if (!processedTerms.Contains(term)) { processedTerms.Add(term); ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term float minScore = 0; Term startTerm = internSavingTemplateTerm.CreateTerm(term); FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength); TermEnum origEnum = reader.Terms(startTerm); int df = 0; if (startTerm.Equals(origEnum.Term)) { df = origEnum.DocFreq(); //store the df so all variants use same idf } int numVariants = 0; int totalVariantDocFreqs = 0; do { Term possibleMatch = fe.Term; if (possibleMatch != null) { numVariants++; totalVariantDocFreqs += fe.DocFreq(); float score = fe.Difference(); if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore) { ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm); variantsQ.InsertWithOverflow(st); minScore = variantsQ.Top().Score; // maintain minScore } } } while (fe.Next()); if (numVariants > 0) { int avgDf = totalVariantDocFreqs / numVariants; if (df == 0)//no direct match we can use as df for all variants { df = avgDf; //use avg df of all variants } // take the top variants (scored by edit distance) and reset the score // to include an IDF factor then add to the global queue for ranking // overall top query terms int size = variantsQ.Size(); for (int i = 0; i < size; i++) { ScoreTerm st = variantsQ.Pop(); st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs); q.InsertWithOverflow(st); } } } } } public override Query Rewrite(IndexReader reader) { if (rewrittenQuery != null) { return rewrittenQuery; } //load up the list of possible terms foreach (FieldVals f in fieldVals) { AddTerms(reader, f); } //clear the list of fields fieldVals.Clear(); BooleanQuery bq = new BooleanQuery(); //create BooleanQueries to hold the variants for each token/field pair and ensure it // has no coord factor //Step 1: sort the termqueries by term/field HashMap> variantQueries = new HashMap>(); int size = q.Size(); for (int i = 0; i < size; i++) { ScoreTerm st = q.Pop(); var l = variantQueries[st.fuzziedSourceTerm]; if (l == null) { l = new List(); variantQueries.Add(st.fuzziedSourceTerm, l); } l.Add(st); } //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries foreach(var variants in variantQueries.Values) { if (variants.Count == 1) { //optimize where only one selected variant ScoreTerm st = variants[0]; TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF); tq.Boost = st.Score; // set the boost to a mix of IDF and score bq.Add(tq, Occur.SHOULD); } else { BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants foreach(ScoreTerm st in variants) { TermQuery tq = new FuzzyTermQuery(st.Term, ignoreTF); // found a match tq.Boost = st.Score; // set the boost using the ScoreTerm's score termVariants.Add(tq, Occur.SHOULD); // add to query } bq.Add(termVariants, Occur.SHOULD); // add to query } } //TODO possible alternative step 3 - organize above booleans into a new layer of field-based // booleans with a minimum-should-match of NumFields-1? bq.Boost = Boost; this.rewrittenQuery = bq; return bq; } //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best // term variants) then is reset with IDF for use in ranking against all other // terms/fields private class ScoreTerm { public Term Term { get; set; } public float Score { get; set; } internal Term fuzziedSourceTerm; public ScoreTerm(Term term, float score, Term fuzziedSourceTerm) { this.Term = term; this.Score = score; this.fuzziedSourceTerm = fuzziedSourceTerm; } } private class ScoreTermQueue : PriorityQueue { public ScoreTermQueue(int size) { Initialize(size); } /* (non-Javadoc) * */ public override bool LessThan(ScoreTerm termA, ScoreTerm termB) { if (termA.Score == termB.Score) return termA.Term.CompareTo(termB.Term) > 0; else return termA.Score < termB.Score; } } //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery) private class FuzzyTermQuery : TermQuery { bool ignoreTF; public FuzzyTermQuery(Term t, bool ignoreTF): base(t) { this.ignoreTF = ignoreTF; } public override Similarity GetSimilarity(Searcher searcher) { Similarity result = base.GetSimilarity(searcher); result = new AnonymousSimilarityDelegator(this,result); return result; } class AnonymousSimilarityDelegator : SimilarityDelegator { FuzzyTermQuery parent = null; public AnonymousSimilarityDelegator(FuzzyTermQuery parent,Similarity result) : base(result) { this.parent = parent; } public override float Tf(float freq) { if (parent.ignoreTF) { return 1; //ignore tf } return base.Tf(freq); } public override float Idf(int docFreq, int numDocs) { //IDF is already factored into individual term boosts return 1; } } } /* (non-Javadoc) * */ public override String ToString(String field) { return null; } public bool IsIgnoreTF() { return ignoreTF; } public void SetIgnoreTF(bool ignoreTF) { this.ignoreTF = ignoreTF; } } }