/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Lucene.Net.Documents;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;

using NUnit.Framework;

namespace Lucene.Net.Search
{
    /// <summary>
    /// Tests <see cref="DuplicateFilter"/> against a small in-memory index in which
    /// several documents share the same value in the <c>url</c> field.
    /// </summary>
    [TestFixture]
    public class DuplicateFilterTest : TestCase
    {
        /// <summary>Name of the field whose values are de-duplicated by the filter.</summary>
        private const string KEY_FIELD = "url";

        private RAMDirectory directory;
        private IndexReader reader;
        private IndexSearcher searcher;

        /// <summary>Query shared by all tests: documents whose "text" field contains "lucene".</summary>
        private readonly TermQuery tq = new TermQuery(new Term("text", "lucene"));

        /// <summary>
        /// Builds the in-memory index used by every test and opens a read-only
        /// reader and a searcher over it.
        /// </summary>
        [SetUp]
        public void SetUp()
        {
            directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(
                directory,
                new StandardAnalyzer(Util.Version.LUCENE_CURRENT),
                true,
                IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                // Add a series of docs with filterable fields: url, text and date.
                // Two distinct urls are used, each appearing in several documents.
                AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
                AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
                AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
                AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
                AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
                AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
                AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
                AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
            }
            finally
            {
                // Close the writer even if adding a document fails, so it is not leaked.
                writer.Close();
            }

            reader = IndexReader.Open(directory, true);
            searcher = new IndexSearcher(reader);
        }

        /// <summary>
        /// Releases the searcher, reader and directory created in <see cref="SetUp"/>.
        /// </summary>
        [TearDown]
        public void TearDown()
        {
            // Release in reverse order of acquisition: the searcher wraps the reader,
            // which in turn reads from the directory.
            searcher.Close();
            reader.Close();
            directory.Close();
        }

        /// <summary>
        /// Adds one document with stored "url", "text" and "date" fields to the index.
        /// </summary>
        /// <param name="writer">Writer the document is added to.</param>
        /// <param name="url">Value of the key field; indexed without analysis.</param>
        /// <param name="text">Value of the analyzed "text" field.</param>
        /// <param name="date">Value of the analyzed "date" field.</param>
        private void AddDoc(IndexWriter writer, String url, String text, String date)
        {
            Document doc = new Document();
            doc.Add(new Field(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("date", date, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        /// <summary>
        /// A filter constructed with only the key field must not return two hits
        /// carrying the same url.
        /// </summary>
        [Test]
        public void TestDefaultFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            HashSet<string> seenUrls = new HashSet<string>();

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;

            // Guard against the loop below passing vacuously on an empty result.
            Assert.IsTrue(hits.Length > 0, "Filtered searching should have found some matches");
            foreach (ScoreDoc hit in hits)
            {
                String url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
                // HashSet.Add returns false when the value is already present.
                Assert.IsTrue(seenUrls.Add(url), "No duplicate urls should be returned");
            }
        }

        /// <summary>
        /// Sanity check: without a filter the same query does return duplicate urls.
        /// </summary>
        [Test]
        public void TestNoFilter()
        {
            HashSet<string> seenUrls = new HashSet<string>();

            ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;

            Assert.IsTrue(hits.Length > 0, "Default searching should have found some matches");
            bool dupsFound = false;
            foreach (ScoreDoc hit in hits)
            {
                String url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
                if (!seenUrls.Add(url))
                {
                    dupsFound = true;
                }
            }
            Assert.IsTrue(dupsFound, "Default searching should have found duplicate urls");
        }

        /// <summary>
        /// With <c>PM_FAST_INVALIDATION</c> processing the filter must still return
        /// each url at most once, and both urls present in the index must be found.
        /// </summary>
        [Test]
        public void TestFastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.ProcessingMode = DuplicateFilter.PM_FAST_INVALIDATION;
            HashSet<string> seenUrls = new HashSet<string>();

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;

            Assert.IsTrue(hits.Length > 0, "Filtered searching should have found some matches");
            foreach (ScoreDoc hit in hits)
            {
                String url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
                Assert.IsTrue(seenUrls.Add(url), "No duplicate urls should be returned");
            }
            Assert.AreEqual(2, seenUrls.Count, "Two urls found");
        }

        /// <summary>
        /// With <c>KM_USE_LAST_OCCURRENCE</c> every hit must be the last document
        /// listed in the index for its url.
        /// </summary>
        [Test]
        public void TestKeepsLastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_LAST_OCCURRENCE;

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;

            Assert.IsTrue(hits.Length > 0, "Filtered searching should have found some matches");
            foreach (ScoreDoc hit in hits)
            {
                String url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
                int lastDoc = 0;
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                try
                {
                    // Walk the whole posting list; the final id seen is the last occurrence.
                    while (td.Next())
                    {
                        lastDoc = td.Doc;
                    }
                }
                finally
                {
                    td.Close();
                }
                Assert.AreEqual(lastDoc, hit.Doc, "Duplicate urls should return last doc");
            }
        }

        /// <summary>
        /// With <c>KM_USE_FIRST_OCCURRENCE</c> every hit must be the first document
        /// listed in the index for its url.
        /// </summary>
        [Test]
        public void TestKeepsFirstFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_FIRST_OCCURRENCE;

            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;

            Assert.IsTrue(hits.Length > 0, "Filtered searching should have found some matches");
            foreach (ScoreDoc hit in hits)
            {
                String url = searcher.Doc(hit.Doc).Get(KEY_FIELD);
                int firstDoc;
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                try
                {
                    // The url was read from an indexed document, so a posting must exist;
                    // fail loudly instead of reading td.Doc from an unpositioned enumerator.
                    Assert.IsTrue(td.Next(), "Url of a returned hit should be present in the index");
                    firstDoc = td.Doc;
                }
                finally
                {
                    td.Close();
                }
                Assert.AreEqual(firstDoc, hit.Doc, "Duplicate urls should return first doc");
            }
        }
    }
}