/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using IndexReader = Lucene.Net.Index.IndexReader; using Term = Lucene.Net.Index.Term; using PriorityQueue = Lucene.Net.Util.PriorityQueue; using ToStringUtils = Lucene.Net.Util.ToStringUtils; namespace Lucene.Net.Search { /// Implements the fuzzy search query. The similarity measurement /// is based on the Levenshtein (edit distance) algorithm. /// /// Warning: this query is not very scalable with its default prefix /// length of 0 - in this case, *every* term will be enumerated and /// cause an edit score calculation. /// /// [Serializable] public class FuzzyQuery:MultiTermQuery { public const float defaultMinSimilarity = 0.5f; public const int defaultPrefixLength = 0; private float minimumSimilarity; private int prefixLength; private bool termLongEnough = false; new protected internal Term term; /// Create a new FuzzyQuery that will match terms with a similarity /// of at least minimumSimilarity to term. /// If a prefixLength > 0 is specified, a common prefix /// of that length is also required. /// /// /// the term to search for /// /// a value between 0 and 1 to set the required similarity /// between the query term and the matching terms. For example, for a /// minimumSimilarity of 0.5 a term of the same length /// as the query term is considered similar to the query term if the edit distance /// between both terms is less than length(term)*0.5 /// /// length of common (non-fuzzy) prefix /// /// IllegalArgumentException if minimumSimilarity is >= 1 or < 0 /// or if prefixLength < 0 /// public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength):base(term) { // will be removed in 3.0 this.term = term; if (minimumSimilarity >= 1.0f) throw new System.ArgumentException("minimumSimilarity >= 1"); else if (minimumSimilarity < 0.0f) throw new System.ArgumentException("minimumSimilarity < 0"); if (prefixLength < 0) throw new System.ArgumentException("prefixLength < 0"); if (term.Text().Length > 1.0f / (1.0f - minimumSimilarity)) { this.termLongEnough = true; } this.minimumSimilarity = minimumSimilarity; this.prefixLength = prefixLength; rewriteMethod = SCORING_BOOLEAN_QUERY_REWRITE; } /// Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}. public FuzzyQuery(Term term, float minimumSimilarity):this(term, minimumSimilarity, defaultPrefixLength) { } /// Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}. public FuzzyQuery(Term term):this(term, defaultMinSimilarity, defaultPrefixLength) { } /// Returns the minimum similarity that is required for this query to match. /// float value between 0.0 and 1.0 /// public virtual float GetMinSimilarity() { return minimumSimilarity; } /// Returns the non-fuzzy prefix length. This is the number of characters at the start /// of a term that must be identical (not fuzzy) to the query term if the query /// is to match that term. /// public virtual int GetPrefixLength() { return prefixLength; } public /*protected internal*/ override FilteredTermEnum GetEnum(IndexReader reader) { return new FuzzyTermEnum(reader, GetTerm(), minimumSimilarity, prefixLength); } /// Returns the pattern term. [Obsolete("Lucene.Net-2.9.1. This method overrides obsolete member Lucene.Net.Search.MultiTermQuery.GetTerm()")] public override Term GetTerm() { return term; } public override void SetRewriteMethod(RewriteMethod method) { throw new System.NotSupportedException("FuzzyQuery cannot change rewrite method"); } public override Query Rewrite(IndexReader reader) { if (!termLongEnough) { // can only match if it's exact return new TermQuery(term); } FilteredTermEnum enumerator = GetEnum(reader); int maxClauseCount = BooleanQuery.GetMaxClauseCount(); ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); ScoreTerm reusableST = null; try { do { float score = 0.0f; Term t = enumerator.Term(); if (t != null) { score = enumerator.Difference(); if (reusableST == null) { reusableST = new ScoreTerm(t, score); } else if (score >= reusableST.score) { // reusableST holds the last "rejected" entry, so, if // this new score is not better than that, there's no // need to try inserting it reusableST.score = score; reusableST.term = t; } else { continue; } reusableST = (ScoreTerm) stQueue.InsertWithOverflow(reusableST); } } while (enumerator.Next()); } finally { enumerator.Close(); } BooleanQuery query = new BooleanQuery(true); int size = stQueue.Size(); for (int i = 0; i < size; i++) { ScoreTerm st = (ScoreTerm) stQueue.Pop(); TermQuery tq = new TermQuery(st.term); // found a match tq.SetBoost(GetBoost() * st.score); // set the boost query.Add(tq, BooleanClause.Occur.SHOULD); // add to query } return query; } public override System.String ToString(System.String field) { System.Text.StringBuilder buffer = new System.Text.StringBuilder(); if (!term.Field().Equals(field)) { buffer.Append(term.Field()); buffer.Append(":"); } buffer.Append(term.Text()); buffer.Append('~'); buffer.Append(SupportClass.Single.ToString(minimumSimilarity)); buffer.Append(ToStringUtils.Boost(GetBoost())); return buffer.ToString(); } protected internal class ScoreTerm { public Term term; public float score; public ScoreTerm(Term term, float score) { this.term = term; this.score = score; } } protected internal class ScoreTermQueue:PriorityQueue { public ScoreTermQueue(int size) { Initialize(size); } /* (non-Javadoc) * @see Lucene.Net.Util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object) */ public override bool LessThan(System.Object a, System.Object b) { ScoreTerm termA = (ScoreTerm) a; ScoreTerm termB = (ScoreTerm) b; if (termA.score == termB.score) return termA.term.CompareTo(termB.term) > 0; else return termA.score < termB.score; } } public override int GetHashCode() { int prime = 31; int result = base.GetHashCode(); result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minimumSimilarity), 0); result = prime * result + prefixLength; result = prime * result + ((term == null)?0:term.GetHashCode()); return result; } public override bool Equals(System.Object obj) { if (this == obj) return true; if (!base.Equals(obj)) return false; if (GetType() != obj.GetType()) return false; FuzzyQuery other = (FuzzyQuery) obj; if (BitConverter.ToInt32(BitConverter.GetBytes(minimumSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minimumSimilarity), 0)) return false; if (prefixLength != other.prefixLength) return false; if (term == null) { if (other.term != null) return false; } else if (!term.Equals(other.term)) return false; return true; } } }