/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;

using NUnit.Framework;

using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
using WhitespaceAnalyzer = Lucene.Net.Analysis.WhitespaceAnalyzer;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using RAMDirectory = Lucene.Net.Store.RAMDirectory;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using Directory = Lucene.Net.Store.Directory;
using MockRAMDirectory = Lucene.Net.Store.MockRAMDirectory;
using QueryParser = Lucene.Net.QueryParsers.QueryParser;

namespace Lucene.Net.Search
{
    /// <summary>
    /// Tests <see cref="FuzzyQuery"/>: hit counts and hit ordering for various
    /// query terms, minimum similarities and prefix lengths.
    /// </summary>
    [TestFixture]
    public class TestFuzzyQuery : LuceneTestCase
    {
        /// <summary>
        /// Indexes seven five-character terms and checks which of them a fuzzy query
        /// matches as the prefix length grows, the order of the hits, and the effect
        /// of lowering <c>BooleanQuery.MaxClauseCount</c>.
        /// </summary>
        [Test]
        public virtual void TestFuzziness()
        {
            RAMDirectory directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
            AddDoc("aaaaa", writer);
            AddDoc("aaaab", writer);
            AddDoc("aaabb", writer);
            AddDoc("aabbb", writer);
            AddDoc("abbbb", writer);
            AddDoc("bbbbb", writer);
            AddDoc("ddddd", writer);
            writer.Optimize();
            writer.Close();
            IndexSearcher searcher = new IndexSearcher(directory, true);

            FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);

            // same with prefix
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(2, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            // test scoring
            query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length, "3 documents should match");
            List<string> order = new List<string>(new[] {"bbbbb", "abbbb", "aabbb"});
            for (int i = 0; i < hits.Length; i++)
            {
                string term = searcher.Doc(hits[i].Doc).Get("field");
                Assert.AreEqual(order[i], term);
            }

            // test BooleanQuery.maxClauseCount
            // The limit is a static setting, so it is restored in the finally block
            // even when an assertion fails.
            int savedClauseCount = BooleanQuery.MaxClauseCount;
            try
            {
                BooleanQuery.MaxClauseCount = 2;
                // This query would normally return 3 documents, because 3 terms match (see above):
                query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
                hits = searcher.Search(query, null, 1000).ScoreDocs;
                Assert.AreEqual(2, hits.Length, "only 2 documents should match");
                order = new List<string>(new[] {"bbbbb", "abbbb"});
                for (int i = 0; i < hits.Length; i++)
                {
                    string term = searcher.Doc(hits[i].Doc).Get("field");
                    Assert.AreEqual(order[i], term);
                }
            }
            finally
            {
                BooleanQuery.MaxClauseCount = savedClauseCount;
            }

            // not similar enough:
            query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);
            query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // query identical to a word in the index:
            query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            // default allows for up to two edits:
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));

            // query similar to a word in the index:
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));

            // now with prefix
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(3, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            Assert.AreEqual("aaabb", searcher.Doc(hits[2].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(2, hits.Length);
            Assert.AreEqual("aaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            Assert.AreEqual("aaaab", searcher.Doc(hits[1].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));

            // now with prefix
            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("ddddd", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // different field = no match:
            query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            searcher.Close();
            directory.Close();
        }

        /// <summary>
        /// Repeats the fuzziness checks with seven-character terms and verifies that
        /// constructing a <see cref="FuzzyQuery"/> with a minimum similarity of 1.1
        /// or -0.1 throws <see cref="ArgumentException"/>.
        /// </summary>
        [Test]
        public virtual void TestFuzzinessLong()
        {
            RAMDirectory directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
            AddDoc("aaaaaaa", writer);
            AddDoc("segment", writer);
            writer.Optimize();
            writer.Close();
            IndexSearcher searcher = new IndexSearcher(directory, true);

            FuzzyQuery query;
            // not similar enough:
            query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);
            // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
            // in TestFuzziness so a bigger difference is allowed:
            query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));

            // now with prefix
            query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 4);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("aaaaaaa", searcher.Doc(hits[0].Doc).Get("field"));
            query = new FuzzyQuery(new Term("field", "aaaaccc"), FuzzyQuery.defaultMinSimilarity, 5);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // no match, more than half of the characters is wrong:
            query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // now with prefix
            query = new FuzzyQuery(new Term("field", "aaacccc"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // "student" and "stellent" are indeed similar to "segment" by default:
            query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            // now with prefix
            query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 1);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            query = new FuzzyQuery(new Term("field", "student"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);
            query = new FuzzyQuery(new Term("field", "stellent"), FuzzyQuery.defaultMinSimilarity, 2);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // "student" doesn't match anymore thanks to increased minimum similarity:
            query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // Minimum similarities of 1.1 and -0.1 must be rejected by the constructor.
            Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), 1.1f), "Expected ArgumentException");
            Assert.Throws<ArgumentException>(() => new FuzzyQuery(new Term("field", "student"), -0.1f), "Expected ArgumentException");

            searcher.Close();
            directory.Close();
        }

        /// <summary>
        /// Runs fuzzy queries with a 0.9 minimum similarity against query terms
        /// shorter than, exactly, and longer than 10 characters.
        /// </summary>
        [Test]
        public virtual void TestTokenLengthOpt()
        {
            RAMDirectory directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
            AddDoc("12345678911", writer);
            AddDoc("segment", writer);
            writer.Optimize();
            writer.Close();
            IndexSearcher searcher = new IndexSearcher(directory, true);

            Query query;
            // term not over 10 chars, so optimization shortcuts
            query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // 10 chars, so no optimization
            query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // over 10 chars, so no optimization
            query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            // over 10 chars, no match
            query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
            hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(0, hits.Length);

            // Release the searcher and directory, as the other tests in this fixture do.
            searcher.Close();
            directory.Close();
        }

        /// <summary>
        /// Parses <c>giga~0.9</c> with the query parser and checks that the only hit
        /// is the document whose stored field is "Giga byte".
        /// </summary>
        [Test]
        public virtual void TestGiga()
        {
            StandardAnalyzer analyzer = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);

            Directory index = new MockRAMDirectory();
            IndexWriter w = new IndexWriter(index, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

            AddDoc("Lucene in Action", w);
            AddDoc("Lucene for Dummies", w);

            // addDoc("Giga", w);
            AddDoc("Giga byte", w);

            AddDoc("ManagingGigabytesManagingGigabyte", w);
            AddDoc("ManagingGigabytesManagingGigabytes", w);

            AddDoc("The Art of Computer Science", w);
            AddDoc("J. K. Rowling", w);
            AddDoc("JK Rowling", w);
            AddDoc("Joanne K Roling", w);
            AddDoc("Bruce Willis", w);
            AddDoc("Willis bruce", w);
            AddDoc("Brute willis", w);
            AddDoc("B. willis", w);
            IndexReader r = w.GetReader();
            w.Close();

            Query q = new QueryParser(Util.Version.LUCENE_CURRENT, "field", analyzer).Parse("giga~0.9");

            // search
            IndexSearcher searcher = new IndexSearcher(r);
            ScoreDoc[] hits = searcher.Search(q, 10).ScoreDocs;
            Assert.AreEqual(1, hits.Length);
            Assert.AreEqual("Giga byte", searcher.Doc(hits[0].Doc).Get("field"));

            // Release everything this test opened: searcher, reader, then directory.
            searcher.Close();
            r.Close();
            index.Close();
        }

        /// <summary>
        /// Adds a document containing a single stored, analyzed field named "field".
        /// </summary>
        /// <param name="text">Value of the field.</param>
        /// <param name="writer">Writer the document is added to.</param>
        private void AddDoc(string text, IndexWriter writer)
        {
            Document doc = new Document();
            doc.Add(new Field("field", text, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }
    }
}