/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using System.Collections.Generic; using System.IO; using System.Linq; using System.Text; using System.Xml; using Contrib.Regex; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Standard; using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Documents; using Lucene.Net.QueryParsers; using Lucene.Net.Search.Spans; using Lucene.Net.Store; using Lucene.Net.Support; using NUnit.Framework; using Version = Lucene.Net.Util.Version; using Lucene.Net.Index; using Lucene.Net.Test.Analysis; using Lucene.Net.Util; using Directory = Lucene.Net.Store.Directory; using Token = Lucene.Net.Analysis.Token; namespace Lucene.Net.Search.Highlight.Test { /* * JUnit Test for Highlighter class. 
*
     */
    public class HighlighterTest : BaseTokenStreamTestCase, IFormatter
    {
        // TODO: change to CURRENT, does not work because posIncr:
        protected internal static readonly Version TEST_VERSION = Version.LUCENE_CURRENT;

        // Not referenced by any test visible in this part of the file.
        private IndexReader reader;

        // Name of the text field the tests query and highlight.
        protected internal static readonly String FIELD_NAME = "contents";

        // Name of the numeric field used by the numeric range query test.
        private static readonly String NUMERIC_FIELD_NAME = "nfield";

        // Query most recently built or parsed by a test.
        private Query query;

        // Directory the tests open their IndexSearcher on.
        // NOTE(review): it is never assigned in the visible part of the file - presumably
        // the fixture set-up (outside this view) creates and fills it; confirm.
        private RAMDirectory ramDir;

        public IndexSearcher searcher = null;

        // Counter the tests compare against the expected number of highlighted terms.
        private int numHighlights = 0;

        // Analyzer the tests use to re-tokenize stored text for highlighting.
        private readonly Analyzer analyzer = new StandardAnalyzer(TEST_VERSION);

        // Result of the most recent search.
        private TopDocs hits;

        // Sample documents. Several tests read texts[0] directly.
        private String[] texts =
            {
                "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot"
                ,
                "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy"
                ,
                "JFK has been shot",
                "John Kennedy has been shot",
                "This text has a typo in referring to Keneddy",
                "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc",
                "y z x y z a b",
                "lets is a the lets is a the lets is a the lets"
            };

        public HighlighterTest()
        {
        }

        /*
         * Constructor for HighlightExtractorTest.
*
         * @param arg0 forwarded unchanged to the base-class constructor
         */
        public HighlighterTest(String arg0) : base(arg0)
        {
        }

        /// <summary>
        /// Searches for the phrase "very long" and prints the best fragment of every
        /// returned hit, fragmenting with a SimpleSpanFragmenter. Makes no assertions.
        /// </summary>
        [Test]
        public void TestQueryScorerHits()
        {
            // Local analyzer/hits deliberately shadow the fields of the same name.
            Analyzer analyzer = new SimpleAnalyzer();
            QueryParser qp = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer);
            query = qp.Parse("\"very long\"");
            searcher = new IndexSearcher(ramDir, true);
            TopDocs hits = searcher.Search(query, 10);

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(scorer);

            for (int i = 0; i < hits.ScoreDocs.Length; i++)
            {
                Document doc = searcher.Doc(hits.ScoreDocs[i].Doc);
                String storedField = doc.Get(FIELD_NAME);

                TokenStream stream = TokenSources.GetAnyTokenStream(searcher.IndexReader, hits.ScoreDocs[i].Doc,
                                                                    FIELD_NAME, doc, analyzer);

                IFragmenter fragmenter = new SimpleSpanFragmenter(scorer);
                highlighter.TextFragmenter = fragmenter;

                String fragment = highlighter.GetBestFragment(stream, storedField);
                Console.WriteLine(fragment);
            }
        }

        /// <summary>
        /// Checks that a query on the default field highlights text of any field, and
        /// that a query on an explicitly named field leaves other fields untouched.
        /// </summary>
        [Test]
        public void TestHighlightingWithDefaultField()
        {
            String s1 = "I call our world Flatland, not because we call it so,";

            QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, new StandardAnalyzer(TEST_VERSION));

            // Verify that a query against the default field results in text being
            // highlighted regardless of the field name.
            Query q = parser.Parse("\"world Flatland\"~3");
            // HighlightField formats with a default SimpleHTMLFormatter, whose tags are <B> and </B>.
            String expected = "I call our <B>world</B> <B>Flatland</B>, not because we call it so,";
            String observed = HighlightField(q, "SOME_FIELD_NAME", s1);
            Console.WriteLine("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
            Assert.AreEqual(expected, observed,
                            "Query in the default field results in text for *ANY* field being highlighted");

            // Verify that a query against a named field does not result in any
            // highlighting when the query field name differs from the name of the
            // field being highlighted, which in this example happens to be the
            // default field name.
            q = parser.Parse("text:\"world Flatland\"~3");
            expected = s1;
            observed = HighlightField(q, FIELD_NAME, s1);
            Console.WriteLine("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
            Assert.AreEqual(expected, observed,
                            "Query in a named field does not result in highlighting when that field isn't in the query");
        }

        /// <summary>
        /// Helper for <see cref="TestHighlightingWithDefaultField"/>: highlights
        /// <paramref name="text"/> as a single fragment, scoring it as field
        /// <paramref name="fieldName"/> with FIELD_NAME as the scorer's default field.
        /// </summary>
        /// <param name="query">Query whose terms are highlighted.</param>
        /// <param name="fieldName">Field name used for tokenizing and scoring.</param>
        /// <param name="text">Text to highlight.</param>
        /// <returns>The highlighted text, or <paramref name="text"/> itself when the highlighter returns an empty string.</returns>
        private static String HighlightField(Query query, String fieldName, String text)
        {
            TokenStream tokenStream = new StandardAnalyzer(TEST_VERSION).TokenStream(fieldName, new StringReader(text));
            // Assuming "<B>", "</B>" used to highlight
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
            QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
            Highlighter highlighter = new Highlighter(formatter, scorer);
            // One fragment spanning the whole text.
            highlighter.TextFragmenter = new SimpleFragmenter(int.MaxValue);

            String rv = highlighter.GetBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
            return rv.Length == 0 ? text : rv;
        }

        /// <summary>
        /// Highlights the hits for "Kennedy" with a QueryScorer and prints the fragments.
        /// </summary>
        [Test]
        public void TestSimpleSpanHighlighter()
        {
            DoSearching("Kennedy");

            int maxNumFragmentsRequired = 2;

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));
                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            // Not sure we can assert anything here - just running to check we dont
            // throw any exceptions
        }

        /// <summary>
        /// LUCENE-1752: the same phrases repeated across two fields inside nested
        /// boolean clauses; expects 7 highlights when scoring field f1.
        /// </summary>
        [Test]
        public void TestRepeatingTermsInMultBooleans()
        {
            String content = "x y z a b c d e f g b c g";
            String ph1 = "\"a b c d\"";
            String ph2 = "\"b c g\"";
            String f1 = "f1";
            String f2 = "f2";
            String f1c = f1 + ":";
            String f2c = f2 + ":";
            String q = "(" + f1c + ph1 + " OR " + f2c + ph1 + ") AND (" + f1c + ph2
                       + " OR " + f2c + ph2 + ")";

            // Local analyzer/query deliberately shadow the fields of the same name.
            Analyzer analyzer = new WhitespaceAnalyzer();
            QueryParser qp = new QueryParser(TEST_VERSION, f1, analyzer);
            Query query = qp.Parse(q);

            QueryScorer scorer = new QueryScorer(query, f1);
            scorer.IsExpandMultiTermQuery = false;

            Highlighter h = new Highlighter(this, scorer);
            h.GetBestFragment(analyzer, f1, content);

            Assert.IsTrue(numHighlights == 7, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Highlights three exact phrase queries in turn and checks the highlight
        /// count after each (3, 4 and 4).
        /// </summary>
        [Test]
        public void TestSimpleQueryScorerPhraseHighlighting()
        {
            DoSearching("\"very long and contains\"");

            int maxNumFragmentsRequired = 2;

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 3, "Failed to find correct number of highlights " + numHighlights + " found");

            numHighlights = 0;
            DoSearching("\"This piece of text refers to Kennedy\"");

            maxNumFragmentsRequired = 2;

            scorer = new QueryScorer(query, FIELD_NAME);
            highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 4, "Failed to find correct number of highlights " + numHighlights + " found");

            numHighlights = 0;
            DoSearching("\"lets is a the lets is a the lets is a the lets\"");

            maxNumFragmentsRequired = 2;

            scorer = new QueryScorer(query, FIELD_NAME);
            highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 4, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Highlights a SpanRegexQuery for "ken.*" wrapped in a SpanOrQuery; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestSpanRegexQuery()
        {
            const int maxNumFragmentsRequired = 2;

            query = new SpanOrQuery(new SpanQuery[] {new SpanRegexQuery(new Term(FIELD_NAME, "ken.*"))});
            searcher = new IndexSearcher(ramDir, true);
            hits = searcher.Search(query, 100);

            var scorer = new QueryScorer(query, FIELD_NAME);
            var highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 5, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Highlights a RegexQuery for "ken.*"; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestRegexQuery()
        {
            const int maxNumFragmentsRequired = 2;

            query = new RegexQuery(new Term(FIELD_NAME, "ken.*"));
            searcher = new IndexSearcher(ramDir, true);
            hits = searcher.Search(query, 100);

            var scorer = new QueryScorer(query, FIELD_NAME);
            var highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 5, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Runs the highlighter over the hits of a numeric range query; there are no
        /// assertions, the test only has to complete without throwing.
        /// </summary>
        [Test]
        public void TestNumericRangeQuery()
        {
            // doesn't currently highlight, but make sure it doesn't cause exception either
            query = NumericRangeQuery.NewIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);
            searcher = new IndexSearcher(ramDir, true);
            hits = searcher.Search(query, 100);
            int maxNumFragmentsRequired = 2;

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(NUMERIC_FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                // The fragments are not inspected; only the call itself is exercised.
                highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
            }
        }

        /// <summary>
        /// Highlights the sloppy phrase "text piece long"~5; expects 6 highlights.
        /// </summary>
        [Test]
        public void TestSimpleQueryScorerPhraseHighlighting2()
        {
            DoSearching("\"text piece long\"~5");

            int maxNumFragmentsRequired = 2;

            var scorer = new QueryScorer(query, FIELD_NAME);
            var highlighter = new Highlighter(this, scorer);
            highlighter.TextFragmenter = new SimpleFragmenter(40);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                var text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                var tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                var result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            Assert.IsTrue(numHighlights == 6, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Highlights the exact phrase "x y z" with a fresh scorer and highlighter per
        /// hit; the count of 3 is asserted inside the loop, after each hit.
        /// </summary>
        [Test]
        public void TestSimpleQueryScorerPhraseHighlighting3()
        {
            DoSearching("\"x y z\"");

            int maxNumFragmentsRequired = 2;

            for (int i = 0; i < hits.TotalHits; i++)
            {
                var text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                var tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));
                var scorer = new QueryScorer(query, FIELD_NAME);
                var highlighter = new Highlighter(this, scorer) {TextFragmenter = new SimpleFragmenter(40)};

                var result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);

                Assert.IsTrue(numHighlights == 3, "Failed to find correct number of highlights " + numHighlights + " found");
            }
        }

        /// <summary>
        /// Prints fragments produced by SimpleSpanFragmenter (fragment sizes 5 and 20)
        /// for two phrase queries. Makes no assertions.
        /// </summary>
        [Test]
        public void TestSimpleSpanFragmenter()
        {
            DoSearching("\"piece of text that is very long\"");

            int maxNumFragmentsRequired = 2;

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleSpanFragmenter(scorer, 5);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            DoSearching("\"been shot\"");

            maxNumFragmentsRequired = 2;

            scorer = new QueryScorer(query, FIELD_NAME);
            highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleSpanFragmenter(scorer, 20);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }
        }

        /// <summary>
        /// Position sensitive query added after position insensitive query; the count
        /// of 4 is asserted inside the loop, after each hit.
        /// </summary>
        [Test]
        public void TestPosTermStdTerm()
        {
            DoSearching("y \"x y z\"");

            int maxNumFragmentsRequired = 2;

            QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
            Highlighter highlighter = new Highlighter(this, scorer);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                highlighter.TextFragmenter = new SimpleFragmenter(40);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);

                Assert.IsTrue(numHighlights == 4, "Failed to find correct number of highlights " + numHighlights + " found");
            }
        }

        /// <summary>
        /// MultiPhraseQuery of (wordx | wordb) followed by wordy; expects 6 highlights.
        /// </summary>
        [Test]
        public void TestQueryScorerMultiPhraseQueryHighlighting()
        {
            MultiPhraseQuery mpq = new MultiPhraseQuery();

            mpq.Add(new Term[] {new Term(FIELD_NAME, "wordx"), new Term(FIELD_NAME, "wordb")});
            mpq.Add(new Term(FIELD_NAME, "wordy"));

            DoSearching(mpq);

            int maxNumFragmentsRequired = 2;
            AssertExpectedHighlightCount(maxNumFragmentsRequired, 6);
        }

        /// <summary>
        /// MultiPhraseQuery with terms at explicit positions 2 and 0, i.e. with a gap
        /// between them; expects 2 highlights in a single fragment.
        /// </summary>
        [Test]
        public void TestQueryScorerMultiPhraseQueryHighlightingWithGap()
        {
            MultiPhraseQuery mpq = new MultiPhraseQuery();

            /*
             * The toString of MultiPhraseQuery doesn't work so well with these
             * out-of-order additions, but the Query itself seems to match accurately.
             */
            mpq.Add(new Term[] {new Term(FIELD_NAME, "wordz")}, 2);
            mpq.Add(new Term[] {new Term(FIELD_NAME, "wordx")}, 0);

            DoSearching(mpq);

            int maxNumFragmentsRequired = 1;
            int expectedHighlights = 2;

            AssertExpectedHighlightCount(maxNumFragmentsRequired, expectedHighlights);
        }

        /// <summary>
        /// Unordered SpanNearQuery of "beginning" and "kennedy" with slop 3; expects 2 highlights.
        /// </summary>
        [Test]
        public void TestNearSpanSimpleQuery()
        {
            DoSearching(new SpanNearQuery(new SpanQuery[]
                                              {
                                                  new SpanTermQuery(new Term(FIELD_NAME, "beginning")),
                                                  new SpanTermQuery(new Term(FIELD_NAME, "kennedy"))
                                              }, 3, false));

            var helper = new TestHighlightRunner(TestHighlightRunner.QUERY);
            helper.TestAction = () => helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
            helper.Run();

            Assert.IsTrue(numHighlights == 2, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Highlights the hits for "Kennedy" with a QueryTermScorer and prints the fragments.
        /// </summary>
        [Test]
        public void TestSimpleQueryTermScorerHighlighter()
        {
            DoSearching("Kennedy");
            Highlighter highlighter = new Highlighter(new QueryTermScorer(query));
            highlighter.TextFragmenter = new SimpleFragmenter(40);
            int maxNumFragmentsRequired = 2;

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                Console.WriteLine("\t" + result);
            }

            // Not sure we can assert anything here - just running to check we dont
            // throw any exceptions
        }

        /// <summary>
        /// Two unordered SpanNearQuery clauses (slop 1) OR-ed in a BooleanQuery; expects 7 highlights.
        /// </summary>
        [Test]
        public void TestSpanHighlighting()
        {
            Query query1 = new SpanNearQuery(new SpanQuery[]
                                                 {
                                                     new SpanTermQuery(new Term(FIELD_NAME, "wordx")),
                                                     new SpanTermQuery(new Term(FIELD_NAME, "wordy"))
                                                 }, 1, false);
            Query query2 = new SpanNearQuery(new SpanQuery[]
                                                 {
                                                     new SpanTermQuery(new Term(FIELD_NAME, "wordy")),
                                                     new SpanTermQuery(new Term(FIELD_NAME, "wordc"))
                                                 }, 1, false);
            BooleanQuery bquery = new BooleanQuery();
            bquery.Add(query1, Occur.SHOULD);
            bquery.Add(query2, Occur.SHOULD);
            DoSearching(bquery);

            var helper = new TestHighlightRunner(TestHighlightRunner.QUERY);
            helper.TestAction = () => helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
            helper.Run();

            Assert.IsTrue(numHighlights == 7, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// SpanNotQuery: "shot" near "kennedy" (slop 3, unordered), excluding spans
        /// that contain "john"; expects 4 highlights.
        /// </summary>
        [Test]
        public void TestNotSpanSimpleQuery()
        {
            DoSearching(new SpanNotQuery(new SpanNearQuery(new SpanQuery[]
                                                               {
                                                                   new SpanTermQuery(new Term(FIELD_NAME, "shot")),
                                                                   new SpanTermQuery(new Term(FIELD_NAME, "kennedy"))
                                                               }, 3, false),
                                         new SpanTermQuery(new Term(FIELD_NAME, "john"))));

            var helper = new TestHighlightRunner(TestHighlightRunner.QUERY);
            helper.TestAction = () => helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
            helper.Run();

            Assert.IsTrue(numHighlights == 4, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Standard highlights for the term query "Kennedy"; expects 4 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsSimpleQuery()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("Kennedy");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for the fuzzy query "Kinnedy~"; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestGetFuzzyFragments()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("Kinnedy~");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this, true);
                    Assert.IsTrue(numHighlights == 5,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for the single-character wildcard query "K?nnedy"; expects 4 highlights.
        /// </summary>
        [Test]
        public void TestGetWildCardFragments()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("K?nnedy");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        [Test]
public void TestGetMidWildCardFragments()
        {
            // Standard highlights for the wildcard query "K*dy"; expects 5 highlights.
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("K*dy");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 5,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for the term range [kannedy TO kznnedy], parsed with a
        /// scoring boolean rewrite; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestGetRangeFragments()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    String queryString = FIELD_NAME + ":[kannedy TO kznnedy]";

                    // Need to explicitly set the QueryParser property to use TermRangeQuery
                    // rather than RangeFilters
                    QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer);
                    parser.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    query = parser.Parse(queryString);
                    DoSearching(query);

                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 5,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Highlights an un-rewritten constant-score WildcardQuery ("ken*") three
        /// times - scoring the real field, a null field, and an unrelated field with
        /// FIELD_NAME as default - expecting 5 highlights each time.
        /// </summary>
        [Test]
        public void TestConstantScoreMultiTermQuery()
        {
            const int maxNumFragmentsRequired = 2;
            const String fragmentSeparator = "...";

            numHighlights = 0;

            query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
            ((WildcardQuery) query).RewriteMethod = MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE;
            searcher = new IndexSearcher(ramDir, true);
            // can't rewrite ConstantScore if you want to highlight it -
            // it rewrites to ConstantScoreQuery which cannot be highlighted
            Console.WriteLine("Searching for: " + query.ToString(FIELD_NAME));
            hits = searcher.Search(query, null, 1000);

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
                Highlighter highlighter = new Highlighter(this, scorer);
                highlighter.TextFragmenter = new SimpleFragmenter(20);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired,
                                                             fragmentSeparator);
                Console.WriteLine("\t" + result);
            }
            Assert.IsTrue(numHighlights == 5, "Failed to find correct number of highlights " + numHighlights + " found");

            // try null field
            hits = searcher.Search(query, null, 1000);
            numHighlights = 0;

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));

                QueryScorer scorer = new QueryScorer(query, null);
                Highlighter highlighter = new Highlighter(this, scorer);
                highlighter.TextFragmenter = new SimpleFragmenter(20);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired,
                                                             fragmentSeparator);
                Console.WriteLine("\t" + result);
            }
            Assert.IsTrue(numHighlights == 5, "Failed to find correct number of highlights " + numHighlights + " found");

            // try default field
            hits = searcher.Search(query, null, 1000);
            numHighlights = 0;

            for (int i = 0; i < hits.TotalHits; i++)
            {
                String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME);
                TokenStream tokenStream = analyzer.TokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));

                QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
                Highlighter highlighter = new Highlighter(this, scorer);
                highlighter.TextFragmenter = new SimpleFragmenter(20);

                String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired,
                                                             fragmentSeparator);
                Console.WriteLine("\t" + result);
            }
            Assert.IsTrue(numHighlights == 5, "Failed to find correct number of highlights " + numHighlights + " found");
        }

        /// <summary>
        /// Standard highlights for the phrase "John Kennedy"; expects 2 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsPhrase()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("\"John Kennedy\"");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    // Currently highlights "John" and "Kennedy" separately
                    Assert.IsTrue(numHighlights == 2,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for an ordered SpanNearQuery of "john" and "kennedy"
        /// (slop 1); expects 2 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsQueryScorer()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    SpanQuery[] clauses = new SpanQuery[]
                        {
                            new SpanTermQuery(new Term("contents", "john")),
                            new SpanTermQuery(new Term("contents", "kennedy")),
                        };

                    SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
                    DoSearching(snq);
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    // Currently highlights "John" and "Kennedy" separately
                    Assert.IsTrue(numHighlights == 2,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Checks that the highlight markup wraps exactly the matched token "help"
        /// and leaves the rest of the text, including "[54-65]", untouched.
        /// </summary>
        [Test]
        public void TestOffByOne()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    TermQuery query = new TermQuery(new Term("data", "help"));
                    Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryTermScorer(query));
                    hg.TextFragmenter = new NullFragmenter();

                    String match = hg.GetBestFragment(analyzer, "data", "help me [54-65]");
                    // A default SimpleHTMLFormatter wraps hits in <B>...</B>.
                    // NUnit's argument order is (expected, actual).
                    Assert.AreEqual("<B>help</B> me [54-65]", match);
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for a SpanNearQuery wrapped in a FilteredQuery; expects 2 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsFilteredQuery()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
                    SpanQuery[] clauses =
                        {
                            new SpanTermQuery(new Term("contents", "john")),
                            new SpanTermQuery(new Term("contents", "kennedy"))
                        };
                    SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
                    FilteredQuery fq = new FilteredQuery(snq, rf);

                    DoSearching(fq);
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    // Currently highlights "John" and "Kennedy" separately
                    Assert.IsTrue(numHighlights == 2,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for a PhraseQuery wrapped in a FilteredQuery; expects 2 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsFilteredPhraseQuery()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    var rf = new TermRangeFilter("contents", "john", "john", true, true);
                    var pq = new PhraseQuery();
                    pq.Add(new Term("contents", "john"));
                    pq.Add(new Term("contents", "kennedy"));
                    var fq = new FilteredQuery(pq, rf);

                    DoSearching(fq);
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    // Currently highlights "John" and "Kennedy" separately
                    Assert.IsTrue(numHighlights == 2,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for "John Kenn*"; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsMultiTerm()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("John Kenn*");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 5,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Standard highlights for "JFK OR Kennedy"; expects 5 highlights.
        /// </summary>
        [Test]
        public void TestGetBestFragmentsWithOr()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("JFK OR Kennedy");
                    helper.DoStandardHighlights(analyzer, searcher, hits, query, this);
                    Assert.IsTrue(numHighlights == 5,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Highlights the hits for "Kennedy" through three Highlighter entry points
        /// (token-stream GetBestFragment, analyzer GetBestFragment, analyzer
        /// GetBestFragments) and expects 4 highlights from each.
        /// </summary>
        [Test]
        public void TestGetBestSingleFragment()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    DoSearching("Kennedy");
                    numHighlights = 0;
                    for (int i = 0; i < hits.TotalHits; i++)
                    {
                        String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                        TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                        Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                        highlighter.TextFragmenter = new SimpleFragmenter(40);
                        String result = highlighter.GetBestFragment(tokenStream, text);
                        Console.WriteLine("\t" + result);
                    }
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");

                    numHighlights = 0;
                    for (int i = 0; i < hits.TotalHits; i++)
                    {
                        String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                        TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                        Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                        highlighter.GetBestFragment(analyzer, FIELD_NAME, text);
                    }
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");

                    numHighlights = 0;
                    for (int i = 0; i < hits.TotalHits; i++)
                    {
                        String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                        TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                        Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                        highlighter.GetBestFragments(analyzer, FIELD_NAME, text, 10);
                    }
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Checks that the best two-character-sized fragment of texts[0] follows the
        /// term weights: "hello" wins at weights 10 vs 1, "kennedy" wins once its
        /// weight is raised to 50.
        /// </summary>
        [Test]
        public void TestGetBestSingleFragmentWithWeights()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2];
                    wTerms[0] = new WeightedSpanTerm(10f, "hello");

                    var positionSpans = new List<PositionSpan> {new PositionSpan(0, 0)};
                    wTerms[0].AddPositionSpans(positionSpans);

                    wTerms[1] = new WeightedSpanTerm(1f, "kennedy");
                    positionSpans = new List<PositionSpan> {new PositionSpan(14, 14)};
                    wTerms[1].AddPositionSpans(positionSpans);

                    Highlighter highlighter = helper.GetHighlighter(wTerms, this);
                    TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(texts[0]));
                    highlighter.TextFragmenter = new SimpleFragmenter(2);

                    String result = highlighter.GetBestFragment(tokenStream, texts[0]).Trim();
                    // NOTE(review): the formatter here is this class's IFormatter implementation,
                    // which is outside this view. If it wraps hits in markup (as SimpleHTMLFormatter
                    // does with <B>...</B>), the two expected literals below have lost that markup -
                    // confirm against HighlightTerm.
                    Assert.IsTrue("Hello".Equals(result),
                                  "Failed to find best section using weighted terms. Found: [" + result + "]");

                    // readjust weights
                    wTerms[1].Weight = 50f;
                    tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(texts[0]));
                    highlighter = helper.GetHighlighter(wTerms, this);
                    highlighter.TextFragmenter = new SimpleFragmenter(2);

                    result = highlighter.GetBestFragment(tokenStream, texts[0]).Trim();
                    Assert.IsTrue("kennedy".Equals(result),
                                  "Failed to find best section using weighted terms. Found: " + result);
                };

            helper.Start();
        }

        /// <summary>
        /// Tests a "complex" analyzer that produces multiple overlapping tokens.
        /// </summary>
        [Test]
        public void TestOverlapAnalyzer()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    var synonyms = new HashMap<string, string>();
                    synonyms["football"] = "soccer,footie";
                    var analyzer = new SynonymAnalyzer(synonyms);
                    var srchkey = "football";

                    var s = "football-soccer in the euro 2004 footie competition";
                    var parser = new QueryParser(TEST_VERSION, "bookid", analyzer);
                    var query = parser.Parse(srchkey);

                    var tokenStream = analyzer.TokenStream(null, new StringReader(s));

                    var highlighter = helper.GetHighlighter(query, null, tokenStream, this);

                    // Get 3 best fragments and separate with a "..."
                    tokenStream = analyzer.TokenStream(null, new StringReader(s));

                    var result = highlighter.GetBestFragments(tokenStream, s, 3, "...");
                    // NOTE(review): the formatter is this class's IFormatter implementation (outside
                    // this view); if it adds markup, this expected literal has lost it - confirm.
                    var expectedResult = "football-soccer in the euro 2004 footie competition";
                    Assert.IsTrue(expectedResult.Equals(result),
                                  "overlapping analyzer should handle highlights OK, expected:" + expectedResult
                                  + " actual:" + result);
                };

            helper.Start();
        }

        /// <summary>
        /// Highlights the hits for "Kennedy" with GetBestFragment; expects 4 highlights.
        /// </summary>
        [Test]
        public void TestGetSimpleHighlight()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("Kennedy");

                    for (int i = 0; i < hits.TotalHits; i++)
                    {
                        String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                        TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));
                        Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                        String result = highlighter.GetBestFragment(tokenStream, text);
                        Console.WriteLine("\t" + result);
                    }
                    Assert.IsTrue(numHighlights == 4,
                                  "Failed to find correct number of highlights " + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Checks that GetBestTextFragments returns the same fragments, in the same
        /// order, as GetBestFragments for every hit of "Kennedy".
        /// </summary>
        [Test]
        public void TestGetTextFragments()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    DoSearching("Kennedy");

                    for (int i = 0; i < hits.TotalHits; i++)
                    {
                        var text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME);
                        var tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));

                        var highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                        highlighter.TextFragmenter = new SimpleFragmenter(20);
                        var stringResults = highlighter.GetBestFragments(tokenStream, text, 10);

                        // The first stream has been consumed; build a fresh one for the second call.
                        tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text));
                        var fragmentResults = highlighter.GetBestTextFragments(tokenStream, text, true, 10);

                        Assert.IsTrue(fragmentResults.Length == stringResults.Length,
                                      "Failed to find correct number of text Fragments: " + fragmentResults.Length
                                      + " vs " + stringResults.Length);
                        for (int j = 0; j < stringResults.Length; j++)
                        {
                            Console.WriteLine(fragmentResults[j]);
                            Assert.IsTrue(fragmentResults[j].ToString().Equals(stringResults[j]),
                                          "Failed to find same text Fragments: " + fragmentResults[j] + " found");
                        }
                    }
                };

            helper.Start();
        }

        /// <summary>
        /// With MaxDocCharsToAnalyze set to 30, the query term "meat" in texts[0]
        /// must not be highlighted; expects 0 highlights.
        /// </summary>
        [Test]
        public void TestMaxSizeHighlight()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    numHighlights = 0;
                    DoSearching("meat");
                    TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(texts[0]));
                    Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this);
                    highlighter.MaxDocCharsToAnalyze = 30;

                    highlighter.GetBestFragment(tokenStream, texts[0]);
                    Assert.IsTrue(numHighlights == 0,
                                  "Setting MaxDocBytesToAnalyze should have prevented us from finding matches for this record: "
                                  + numHighlights + " found");
                };

            helper.Start();
        }

        /// <summary>
        /// Checks that the returned fragment stays shorter than MaxDocCharsToAnalyze
        /// (100) when a matching token is followed by 10000 stop words, both before
        /// and after a second matching token is appended at the very end.
        /// </summary>
        [Test]
        public void TestMaxSizeHighlightTruncates()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    var goodWord = "goodtoken";
                    var stopWords = Support.Compatibility.SetFactory.CreateHashSet(new[] { "stoppedtoken" });
                    // only one stopword
                    var stopWord = stopWords.First();

                    var query = new TermQuery(new Term("data", goodWord));

                    string match;
                    StringBuilder sb = new StringBuilder();
                    sb.Append(goodWord);
                    for (int i = 0; i < 10000; i++)
                    {
                        sb.Append(" ");
                        sb.Append(stopWord);
                    }
                    SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
                    Highlighter hg = helper.GetHighlighter(query, "data",
                                                           new StandardAnalyzer(TEST_VERSION, stopWords).TokenStream(
                                                               "data", new StringReader(sb.ToString())),
                                                           fm);
                    hg.TextFragmenter = new NullFragmenter();
                    hg.MaxDocCharsToAnalyze = 100;
                    match = hg.GetBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "data", sb.ToString());
                    Assert.IsTrue(match.Length < hg.MaxDocCharsToAnalyze,
                                  "Matched text should be no more than 100 chars in length ");

                    // add another tokenized word to the overrall length - but set way
                    // beyond the length of text under consideration (after a large slug
                    // of stop words + whitespace)
                    sb.Append(" ");
                    sb.Append(goodWord);
                    match = hg.GetBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "data", sb.ToString());
                    Assert.IsTrue(match.Length < hg.MaxDocCharsToAnalyze,
                                  "Matched text should be no more than 100 chars in length ");
                };

            helper.Start();
        }

        [Test]
        public void TestMaxSizeEndHighlight()
        {
            var helper = new TestHighlightRunner();
            helper.TestAction = () =>
                {
                    var stopWords = Support.Compatibility.SetFactory.CreateHashSet(new[] {"in", "it"});
                    TermQuery query = new TermQuery(new Term("text", "searchterm"));

                    String text = "this is a text with searchterm in it";
                    SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
                    Highlighter hg = helper.GetHighlighter(query, "text",
                                                           new StandardAnalyzer(TEST_VERSION, stopWords).
TokenStream("text", new StringReader(text)), fm); hg.TextFragmenter = new NullFragmenter(); hg.MaxDocCharsToAnalyze = 36; String match = hg.GetBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "text", text); Assert.IsTrue(match.EndsWith("in it"), "Matched text should contain remainder of text after highlighted query "); }; helper.Start(); } [Test] public void TestUnRewrittenQuery() { var helper = new TestHighlightRunner(); helper.TestAction = () => { numHighlights = 0; // test to show how rewritten query can still be used searcher = new IndexSearcher(ramDir, true); Analyzer analyzer = new StandardAnalyzer(TEST_VERSION); QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); Query query = parser.Parse("JF? or Kenned*"); Console.WriteLine("Searching with primitive query"); // forget to set this and... // query=query.Rewrite(reader); TopDocs hits = searcher.Search(query, null, 1000); // create an instance of the highlighter with the tags used to surround // highlighted text // QueryHighlightExtractor highlighter = new // QueryHighlightExtractor(this, // query, new StandardAnalyzer(TEST_VERSION)); int maxNumFragmentsRequired = 3; for (int i = 0; i < hits.TotalHits; i++) { String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME); TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this, false); highlighter.TextFragmenter = new SimpleFragmenter(40); String highlightedText = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); Console.WriteLine(highlightedText); } // We expect to have zero highlights if the query is multi-terms and is // not // rewritten! 
Assert.IsTrue(numHighlights == 0, "Failed to find correct number of highlights " + numHighlights + " found"); }; helper.Start(); } [Test] public void TestNoFragments() { var helper = new TestHighlightRunner(); helper.TestAction = () => { DoSearching("AnInvalidQueryWhichShouldYieldNoResults"); foreach (string text in texts) { TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = helper.GetHighlighter(query, FIELD_NAME, tokenStream, this); String result = highlighter.GetBestFragment(tokenStream, text); Assert.IsNull(result, "The highlight result should be null for text with no query terms"); } }; helper.Start(); } public class MockScorer : IScorer { public TokenStream Init(TokenStream tokenStream) { return null; } public void StartFragment(TextFragment newFragment) { } public float GetTokenScore() { return 0; } public float FragmentScore { get { return 1; } } } /* * Demonstrates creation of an XHTML compliant doc using new encoding facilities. * * @throws Exception */ [Test] public void TestEncoding() { String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article"; // run the highlighter on the raw content (scorer does not score any tokens // for // highlighting but scores a single fragment for selection Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new MockScorer()); highlighter.TextFragmenter = new SimpleFragmenter(2000); TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(rawDocContent)); String encodedSnippet = highlighter.GetBestFragments(tokenStream, rawDocContent, 1, ""); // An ugly bit of XML creation: String xhtml = "\n" + "\n" + "\n" + "My Test HTML Document\n" + "\n" + "\n" + "

" + encodedSnippet + "

\n" + "\n" + ""; // now an ugly built of XML parsing to test the snippet is encoded OK var doc = new XmlDocument(); doc.LoadXml(xhtml); var root = doc.DocumentElement; var nodes = root.GetElementsByTagName("body"); var body = (XmlElement) nodes[0]; nodes = body.GetElementsByTagName("h2"); var h2 = (XmlElement) nodes[0]; string decodedSnippet = h2.FirstChild.Value; Assert.AreEqual(rawDocContent, decodedSnippet, "XHTML Encoding should have worked:"); } [Test] public void TestMultiSearcher() { // setup index 1 RAMDirectory ramDir1 = new RAMDirectory(); IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(TEST_VERSION), true, IndexWriter.MaxFieldLength.UNLIMITED); Document d = new Document(); Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.ANALYZED); d.Add(f); writer1.AddDocument(d); writer1.Optimize(); writer1.Close(); IndexReader reader1 = IndexReader.Open(ramDir1, true); // setup index 2 RAMDirectory ramDir2 = new RAMDirectory(); IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(TEST_VERSION), true, IndexWriter.MaxFieldLength.UNLIMITED); d = new Document(); f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.ANALYZED); d.Add(f); writer2.AddDocument(d); writer2.Optimize(); writer2.Close(); IndexReader reader2 = IndexReader.Open(ramDir2, true); var searchers = new IndexSearcher[2]; searchers[0] = new IndexSearcher(ramDir1, true); searchers[1] = new IndexSearcher(ramDir2, true); MultiSearcher multiSearcher = new MultiSearcher(searchers); QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, new StandardAnalyzer(TEST_VERSION)); parser.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE; query = parser.Parse("multi*"); Console.WriteLine("Searching for: " + query.ToString(FIELD_NAME)); // at this point the multisearcher calls combine(query[]) hits = multiSearcher.Search(query, null, 1000); // query = QueryParser.Parse("multi*", FIELD_NAME, new 
StandardAnalyzer(TEST_VERSION)); Query[] expandedQueries = new Query[2]; expandedQueries[0] = query.Rewrite(reader1); expandedQueries[1] = query.Rewrite(reader2); query = query.Combine(expandedQueries); // create an instance of the highlighter with the tags used to surround // highlighted text Highlighter highlighter = new Highlighter(this, new QueryTermScorer(query)); for (int i = 0; i < hits.TotalHits; i++) { String text = multiSearcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME); TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text)); String highlightedText = highlighter.GetBestFragment(tokenStream, text); Console.WriteLine(highlightedText); } Assert.IsTrue(numHighlights == 2, "Failed to find correct number of highlights " + numHighlights + " found"); } [Test] public void TestFieldSpecificHighlighting() { var helper = new TestHighlightRunner(); helper.TestAction = () => { var docMainText = "fred is one of the people"; var parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); var query = parser.Parse("fred category:people"); // highlighting respects fieldnames used in query IScorer fieldSpecificScorer = null; if (helper.Mode == TestHighlightRunner.QUERY) { fieldSpecificScorer = new QueryScorer(query, FIELD_NAME); } else if (helper.Mode == TestHighlightRunner.QUERY_TERM) { fieldSpecificScorer = new QueryTermScorer(query, "contents"); } var fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), fieldSpecificScorer) {TextFragmenter = new NullFragmenter()}; String result = fieldSpecificHighlighter.GetBestFragment(analyzer, FIELD_NAME, docMainText); Assert.AreEqual(result, "fred is one of the people", "Should match"); // highlighting does not respect fieldnames used in query IScorer fieldInSpecificScorer = null; if (helper.Mode == TestHighlightRunner.QUERY) { fieldInSpecificScorer = new QueryScorer(query, null); } else if (helper.Mode == TestHighlightRunner.QUERY_TERM) { fieldInSpecificScorer = new 
QueryTermScorer(query); } var fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(), fieldInSpecificScorer) {TextFragmenter = new NullFragmenter()}; result = fieldInSpecificHighlighter.GetBestFragment(analyzer, FIELD_NAME, docMainText); Assert.AreEqual(result, "fred is one of the people", "Should match"); reader.Close(); }; helper.Start(); } private class MockTokenStream : TokenStream { public Action SetupAction { get; set; } public Func IncrementTokenAction { get; set; } public IEnumerator iter; public ITermAttribute termAtt; public IPositionIncrementAttribute posIncrAtt; public IOffsetAttribute offsetAtt; public void RunSetup() { SetupAction(); } public override bool IncrementToken() { return IncrementTokenAction(); } protected override void Dispose(bool disposing) { // do nothing } } protected TokenStream getTS2() { var ts = new MockTokenStream(); ts.SetupAction = () => { ts.termAtt = ts.AddAttribute(); ts.posIncrAtt = ts.AddAttribute(); ts.offsetAtt = ts.AddAttribute(); var lst = new List(); Token t = CreateToken("hi", 0, 2); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("hispeed", 0, 8); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("speed", 3, 8); t.PositionIncrement = 0; lst.Add(t); t = CreateToken("10", 8, 10); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("foo", 11, 14); t.PositionIncrement = 1; lst.Add(t); ts.iter = lst.GetEnumerator(); }; ts.IncrementTokenAction = () => { if (ts.iter.MoveNext()) { Token token = ts.iter.Current; ts.ClearAttributes(); ts.termAtt.SetTermBuffer(token.Term); ts.posIncrAtt.PositionIncrement = token.PositionIncrement; ts.offsetAtt.SetOffset(token.StartOffset, token.EndOffset); return true; } return false; }; ts.RunSetup(); return ts; } // same token-stream as above, but the bigger token comes first this time protected TokenStream GetTS2A() { var ts = new MockTokenStream(); ts.SetupAction = () => { ts.termAtt = ts.AddAttribute(); ts.posIncrAtt = ts.AddAttribute(); ts.offsetAtt = 
ts.AddAttribute(); var lst = new List(); Token t = CreateToken("hispeed", 0, 8); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("hi", 0, 2); t.PositionIncrement = 0; lst.Add(t); t = CreateToken("speed", 3, 8); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("10", 8, 10); t.PositionIncrement = 1; lst.Add(t); t = CreateToken("foo", 11, 14); t.PositionIncrement = 1; lst.Add(t); ts.iter = lst.GetEnumerator(); }; ts.IncrementTokenAction = () => { if (ts.iter.MoveNext()) { Token token = ts.iter.Current; ts.ClearAttributes(); ts.termAtt.SetTermBuffer(token.Term); ts.posIncrAtt.PositionIncrement = (token.PositionIncrement); ts.offsetAtt.SetOffset(token.StartOffset, token.EndOffset); return true; } return false; }; ts.RunSetup(); return ts; } [Test] public void TestOverlapAnalyzer2() { var helper = new TestHighlightRunner(); helper.TestAction = () => { String s = "Hi-Speed10 foo"; Query query; Highlighter highlighter; String result; query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("foo"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("10"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("hi"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "speed"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new 
QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "hispeed"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "hi speed"); highlighter = helper.GetHighlighter(query, "text", getTS2(), this); result = highlighter.GetBestFragments(getTS2(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); // ///////////////// same tests, just put the bigger overlapping token // first query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("foo"); highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("10"); highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse("hi"); highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "speed"); highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "hispeed"); highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); query = new QueryParser(TEST_VERSION, "text", new WhitespaceAnalyzer()).Parse( "hi speed"); 
highlighter = helper.GetHighlighter(query, "text", GetTS2A(), this); result = highlighter.GetBestFragments(GetTS2A(), s, 3, "..."); Assert.AreEqual(result, "Hi-Speed10 foo"); }; helper.Start(); } private Directory dir = new RAMDirectory(); private Analyzer a = new WhitespaceAnalyzer(); [Test] public void TestWeightedTermsWithDeletes() { MakeIndex(); DeleteDocument(); SearchIndex(); } private static Document Doc(String f, String v) { Document doc = new Document(); doc.Add(new Field(f, v, Field.Store.YES, Field.Index.ANALYZED)); return doc; } private void MakeIndex() { IndexWriter writer = new IndexWriter(dir, a, IndexWriter.MaxFieldLength.LIMITED); writer.AddDocument(Doc("t_text1", "random words for highlighting tests del")); writer.AddDocument(Doc("t_text1", "more random words for second field del")); writer.AddDocument(Doc("t_text1", "random words for highlighting tests del")); writer.AddDocument(Doc("t_text1", "more random words for second field")); writer.Optimize(); writer.Close(); } private void DeleteDocument() { IndexWriter writer = new IndexWriter(dir, a, false, IndexWriter.MaxFieldLength.LIMITED); writer.DeleteDocuments(new Term("t_text1", "del")); // To see negative idf, keep comment the following line //writer.Optimize(); writer.Close(); } private void SearchIndex() { String q = "t_text1:random"; QueryParser parser = new QueryParser(TEST_VERSION, "t_text1", a); Query query = parser.Parse(q); IndexSearcher searcher = new IndexSearcher(dir, true); // This scorer can return negative idf -> null fragment IScorer scorer = new QueryTermScorer(query, searcher.IndexReader, "t_text1"); // This scorer doesn't use idf (patch version) //Scorer scorer = new QueryTermScorer( query, "t_text1" ); Highlighter h = new Highlighter(scorer); TopDocs hits = searcher.Search(query, null, 10); for (int i = 0; i < hits.TotalHits; i++) { Document doc = searcher.Doc(hits.ScoreDocs[i].Doc); String result = h.GetBestFragment(a, "t_text1", doc.Get("t_text1")); 
Console.WriteLine("result:" + result); Assert.AreEqual(result, "more random words for second field"); } searcher.Close(); } /* * * [Test] public void testBigramAnalyzer() { * //test to ensure analyzers with none-consecutive start/end offsets //dont * double-highlight text //setup index 1 RAMDirectory ramDir = new * RAMDirectory(); Analyzer bigramAnalyzer=new CJKAnalyzer(); IndexWriter * writer = new IndexWriter(ramDir,bigramAnalyzer , true); Document d = new * Document(); Field f = new Field(FIELD_NAME, "java abc def", true, true, * true); d.Add(f); writer.AddDocument(d); writer.Close(); IndexReader reader = * IndexReader.Open(ramDir, true); * * IndexSearcher searcher=new IndexSearcher(reader); query = * QueryParser.Parse("abc", FIELD_NAME, bigramAnalyzer); * Console.WriteLine("Searching for: " + query.ToString(FIELD_NAME)); hits = * searcher.Search(query); * * Highlighter highlighter = new Highlighter(this,new * QueryFragmentScorer(query)); * * for (int i = 0; i < hits.TotalHits; i++) { String text = * searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME); TokenStream * tokenStream=bigramAnalyzer.TokenStream(FIELD_NAME,new StringReader(text)); * String highlightedText = highlighter.GetBestFragment(tokenStream,text); * Console.WriteLine(highlightedText); } } */ public String HighlightTerm(String originalText, TokenGroup group) { if (@group.TotalScore <= 0) { return originalText; } numHighlights++; // update stats used in assertions return "" + originalText + ""; } public void DoSearching(String queryString) { QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); parser.EnablePositionIncrements = true; parser.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE; query = parser.Parse(queryString); DoSearching(query); } public void DoSearching(Query unReWrittenQuery) { searcher = new IndexSearcher(ramDir, true); // for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) // you must use a rewritten query! 
query = unReWrittenQuery.Rewrite(reader); Console.WriteLine("Searching for: " + query.ToString(FIELD_NAME)); hits = searcher.Search(query, null, 1000); } public void AssertExpectedHighlightCount(int maxNumFragmentsRequired, int expectedHighlights) { for (int i = 0; i < hits.TotalHits; i++) { String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(FIELD_NAME); TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text)); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); highlighter.TextFragmenter = new SimpleFragmenter(40); String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); Console.WriteLine("\t" + result); Assert.IsTrue(numHighlights == expectedHighlights, "Failed to find correct number of highlights " + numHighlights + " found"); } } [SetUp] public override void SetUp() { base.SetUp(); ramDir = new RAMDirectory(); IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(TEST_VERSION), true, IndexWriter.MaxFieldLength.UNLIMITED); for (int i = 0; i < texts.Length; i++) { AddDoc(writer, texts[i]); } Document doc = new Document(); NumericField nfield = new NumericField(NUMERIC_FIELD_NAME, Field.Store.YES, true); nfield.SetIntValue(1); doc.Add(nfield); writer.AddDocument(doc, analyzer); nfield = new NumericField(NUMERIC_FIELD_NAME, Field.Store.YES, true); nfield.SetIntValue(3); doc = new Document(); doc.Add(nfield); writer.AddDocument(doc, analyzer); nfield = new NumericField(NUMERIC_FIELD_NAME, Field.Store.YES, true); nfield.SetIntValue(5); doc = new Document(); doc.Add(nfield); writer.AddDocument(doc, analyzer); nfield = new NumericField(NUMERIC_FIELD_NAME, Field.Store.YES, true); nfield.SetIntValue(7); doc = new Document(); doc.Add(nfield); writer.AddDocument(doc, analyzer); writer.Optimize(); writer.Close(); reader = IndexReader.Open(ramDir, true); numHighlights = 0; } private void AddDoc(IndexWriter writer, String text) { 
Document d = new Document(); Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.ANALYZED); d.Add(f); writer.AddDocument(d); } [TearDown] public override void TearDown() { base.TearDown(); } private static Token CreateToken(String term, int start, int offset) { Token token = new Token(start, offset); token.SetTermBuffer(term); return token; } } // =================================================================== // ========== BEGIN TEST SUPPORTING CLASSES // ========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE // ========== MADE MORE GENERALLY USEFUL. // TODO - make synonyms all interchangeable with each other and produce // a version that does hyponyms - the "is a specialised type of ...." // so that car = audi, bmw and volkswagen but bmw != audi so different // behaviour to synonyms // =================================================================== internal class SynonymAnalyzer : Analyzer { private IDictionary synonyms; public SynonymAnalyzer(IDictionary synonyms) { this.synonyms = synonyms; } /* * (non-Javadoc) * * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, * java.io.Reader) */ public override TokenStream TokenStream(String arg0, System.IO.TextReader arg1) { LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1); stream.AddAttribute(); stream.AddAttribute(); stream.AddAttribute(); return new SynonymTokenizer(stream, synonyms); } } /* * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer) * */ internal class SynonymTokenizer : TokenStream { private TokenStream realStream; private Token currentRealToken = null; private Token cRealToken = null; private IDictionary synonyms; private Tokenizer st = null; private ITermAttribute realTermAtt; private IPositionIncrementAttribute realPosIncrAtt; private IOffsetAttribute realOffsetAtt; private ITermAttribute termAtt; private IPositionIncrementAttribute posIncrAtt; private IOffsetAttribute offsetAtt; public 
SynonymTokenizer(TokenStream realStream, IDictionary synonyms) { this.realStream = realStream; this.synonyms = synonyms; realTermAtt = realStream.AddAttribute(); realPosIncrAtt = realStream.AddAttribute(); realOffsetAtt = realStream.AddAttribute(); termAtt = AddAttribute(); posIncrAtt = AddAttribute(); offsetAtt = AddAttribute(); } public override bool IncrementToken() { if (currentRealToken == null) { bool next = realStream.IncrementToken(); if (!next) { return false; } //Token nextRealToken = new Token(, offsetAtt.startOffset(), offsetAtt.endOffset()); ClearAttributes(); termAtt.SetTermBuffer(realTermAtt.Term); offsetAtt.SetOffset(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset); posIncrAtt.PositionIncrement = realPosIncrAtt.PositionIncrement; String expansions = synonyms[realTermAtt.Term]; if (expansions == null) { return true; } st = new Tokenizer(expansions, ","); if (st.HasMoreTokens()) { currentRealToken = new Token(realOffsetAtt.StartOffset, realOffsetAtt.EndOffset); currentRealToken.SetTermBuffer(realTermAtt.Term); } return true; } else { String tok = st.NextToken(); ClearAttributes(); termAtt.SetTermBuffer(tok); offsetAtt.SetOffset(currentRealToken.StartOffset, currentRealToken.EndOffset); posIncrAtt.PositionIncrement = 0; if (!st.HasMoreTokens()) { currentRealToken = null; st = null; } return true; } } protected override void Dispose(bool disposing) { } } internal class TestHighlightRunner { public static readonly int QUERY = 0; public static readonly int QUERY_TERM = 1; public Action TestAction { get; set; } public int Mode { get; private set; } public TestHighlightRunner() : this(QUERY) { } public TestHighlightRunner(int mode) { Mode = mode; } public Highlighter GetHighlighter(Query query, String fieldName, TokenStream stream, IFormatter formatter) { return GetHighlighter(query, fieldName, stream, formatter, true); } public Highlighter GetHighlighter(Query query, String fieldName, TokenStream stream, IFormatter formatter, bool expanMultiTerm) { 
IScorer scorer = null; if (Mode == QUERY) { scorer = new QueryScorer(query, fieldName); if (!expanMultiTerm) { ((QueryScorer) scorer).IsExpandMultiTermQuery = false; } } else if (Mode == QUERY_TERM) { scorer = new QueryTermScorer(query); } else { throw new SystemException("Unknown highlight mode"); } return new Highlighter(formatter, scorer); } public Highlighter GetHighlighter(WeightedTerm[] weightedTerms, IFormatter formatter) { if (Mode == QUERY) { return new Highlighter(formatter, new QueryScorer((WeightedSpanTerm[]) weightedTerms)); } else if (Mode == QUERY_TERM) { return new Highlighter(formatter, new QueryTermScorer(weightedTerms)); } else { throw new SystemException("Unknown highlight mode"); } } public void DoStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, IFormatter formatter) { DoStandardHighlights(analyzer, searcher, hits, query, formatter, false); } public void DoStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, IFormatter formatter, bool expandMT) { IFragmenter frag = new SimpleFragmenter(20); for (int i = 0; i < hits.TotalHits; i++) { String text = searcher.Doc(hits.ScoreDocs[i].Doc).Get(HighlighterTest.FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; IScorer scorer = null; TokenStream tokenStream = analyzer.TokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); if (Mode == QUERY) { scorer = new QueryScorer(query); } else if (Mode == QUERY_TERM) { scorer = new QueryTermScorer(query); } var highlighter = new Highlighter(formatter, scorer) {TextFragmenter = frag}; String result = highlighter.GetBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); Console.WriteLine("\t" + result); } } public void Run() { if (TestAction == null) throw new InvalidOperationException("Must set TestAction before calling run!"); TestAction(); } public void Start() { if (TestAction == null) throw new InvalidOperationException("Must 
set TestAction before calling start!"); Console.WriteLine("Run QueryScorer"); TestAction(); Console.WriteLine("Run QueryTermScorer"); Mode = QUERY_TERM; TestAction(); } } }