package org.apache.lucene.search.highlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.LowerCaseTokenizer; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.MultiSearcher; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeFilter; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner; import org.apache.lucene.search.regex.RegexQuery; import org.apache.lucene.search.regex.SpanRegexQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import org.w3c.dom.Element; import org.w3c.dom.NodeList; /** * JUnit Test for Highlighter class. * */ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter { // TODO: change to CURRENT, does not work because posIncr: static final Version TEST_VERSION = Version.LUCENE_CURRENT; private IndexReader reader; static final String FIELD_NAME = "contents"; private static final String NUMERIC_FIELD_NAME = "nfield"; private Query query; RAMDirectory ramDir; public IndexSearcher searcher = null; int numHighlights = 0; final Analyzer analyzer = new StandardAnalyzer(TEST_VERSION); TopDocs hits; String[] texts = { "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", "JFK has been shot", "John Kennedy has been shot", "This text has a typo in referring to Keneddy", "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" }; /** * Constructor for HighlightExtractorTest. * * @param arg0 */ public HighlighterTest(String arg0) { super(arg0); } public void testQueryScorerHits() throws Exception { Analyzer analyzer = new SimpleAnalyzer(); QueryParser qp = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); query = qp.parse("\"very long\""); searcher = new IndexSearcher(ramDir, true); TopDocs hits = searcher.search(query, 10); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); for (int i = 0; i < hits.scoreDocs.length; i++) { Document doc = searcher.doc(hits.scoreDocs[i].doc); String storedField = doc.get(FIELD_NAME); TokenStream stream = TokenSources.getAnyTokenStream(searcher .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); highlighter.setTextFragmenter(fragmenter); String fragment = highlighter.getBestFragment(stream, storedField); System.out.println(fragment); } } public void testHighlightingWithDefaultField() throws Exception { String s1 = "I call our world Flatland, not because we call it so,"; QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, new StandardAnalyzer(TEST_VERSION)); // Verify that a query against the default field results in text being // highlighted // regardless of the field name. Query q = parser.parse("\"world Flatland\"~3"); String expected = "I call our world Flatland, not because we call it so,"; String observed = highlightField(q, "SOME_FIELD_NAME", s1); System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); assertEquals("Query in the default field results in text for *ANY* field being highlighted", expected, observed); // Verify that a query against a named field does not result in any // highlighting // when the query field name differs from the name of the field being // highlighted, // which in this example happens to be the default field name. q = parser.parse("text:\"world Flatland\"~3"); expected = s1; observed = highlightField(q, FIELD_NAME, s1); System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed); assertEquals( "Query in a named field does not result in highlighting when that field isn't in the query", s1, highlightField(q, FIELD_NAME, s1)); } /** * This method intended for use with testHighlightingWithDefaultField() * @throws InvalidTokenOffsetsException */ private static String highlightField(Query query, String fieldName, String text) throws IOException, InvalidTokenOffsetsException { TokenStream tokenStream = new StandardAnalyzer(TEST_VERSION).tokenStream(fieldName, new StringReader(text)); // Assuming "", "" used to highlight SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME); Highlighter highlighter = new Highlighter(formatter, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)"); return rv.length() == 0 ? text : rv; } public void testSimpleSpanHighlighter() throws Exception { doSearching("Kennedy"); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } // Not sure we can assert anything here - just running to check we dont // throw any exceptions } // LUCENE-1752 public void testRepeatingTermsInMultBooleans() throws Exception { String content = "x y z a b c d e f g b c g"; String ph1 = "\"a b c d\""; String ph2 = "\"b c g\""; String f1 = "f1"; String f2 = "f2"; String f1c = f1 + ":"; String f2c = f2 + ":"; String q = "(" + f1c + ph1 + " OR " + f2c + ph1 + ") AND (" + f1c + ph2 + " OR " + f2c + ph2 + ")"; Analyzer analyzer = new WhitespaceAnalyzer(); QueryParser qp = new QueryParser(TEST_VERSION, f1, analyzer); Query query = qp.parse(q); QueryScorer scorer = new QueryScorer(query, f1); scorer.setExpandMultiTermQuery(false); Highlighter h = new Highlighter(this, scorer); h.getBestFragment(analyzer, f1, content); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 7); } public void testSimpleQueryScorerPhraseHighlighting() throws Exception { doSearching("\"very long and contains\""); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 3); numHighlights = 0; doSearching("\"This piece of text refers to Kennedy\""); maxNumFragmentsRequired = 2; scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); numHighlights = 0; doSearching("\"lets is a the lets is a the lets is a the lets\""); maxNumFragmentsRequired = 2; scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } public void testSpanRegexQuery() throws Exception { query = new SpanOrQuery(new SpanQuery [] { new SpanRegexQuery(new Term(FIELD_NAME, "ken.*")) }); searcher = new IndexSearcher(ramDir, true); hits = searcher.search(query, 100); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } public void testRegexQuery() throws Exception { query = new RegexQuery(new Term(FIELD_NAME, "ken.*")); searcher = new IndexSearcher(ramDir, true); hits = searcher.search(query, 100); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } public void testNumericRangeQuery() throws Exception { // doesn't currently highlight, but make sure it doesn't cause exception either query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true); searcher = new IndexSearcher(ramDir, true); hits = searcher.search(query, 100); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(NUMERIC_FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); //System.out.println("\t" + result); } } public void testSimpleQueryScorerPhraseHighlighting2() throws Exception { doSearching("\"text piece long\"~5"); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this,scorer); highlighter.setTextFragmenter(new SimpleFragmenter(40)); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 6); } public void testSimpleQueryScorerPhraseHighlighting3() throws Exception { doSearching("\"x y z\""); int maxNumFragmentsRequired = 2; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 3); } } public void testSimpleSpanFragmenter() throws Exception { doSearching("\"piece of text that is very long\""); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } doSearching("\"been shot\""); maxNumFragmentsRequired = 2; scorer = new QueryScorer(query, FIELD_NAME); highlighter = new Highlighter(this, scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 20)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } } // position sensitive query added after position insensitive query public void testPosTermStdTerm() throws Exception { doSearching("y \"x y z\""); int maxNumFragmentsRequired = 2; QueryScorer scorer = new QueryScorer(query, FIELD_NAME); Highlighter highlighter = new Highlighter(this,scorer); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,new StringReader(text)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } } public void testQueryScorerMultiPhraseQueryHighlighting() throws Exception { MultiPhraseQuery mpq = new MultiPhraseQuery(); mpq.add(new Term[] { new Term(FIELD_NAME, "wordx"), new Term(FIELD_NAME, "wordb") }); mpq.add(new Term(FIELD_NAME, "wordy")); doSearching(mpq); final int maxNumFragmentsRequired = 2; assertExpectedHighlightCount(maxNumFragmentsRequired, 6); } public void testQueryScorerMultiPhraseQueryHighlightingWithGap() throws Exception { MultiPhraseQuery mpq = new MultiPhraseQuery(); /* * The toString of MultiPhraseQuery doesn't work so well with these * out-of-order additions, but the Query itself seems to match accurately. */ mpq.add(new Term[] { new Term(FIELD_NAME, "wordz") }, 2); mpq.add(new Term[] { new Term(FIELD_NAME, "wordx") }, 0); doSearching(mpq); final int maxNumFragmentsRequired = 1; final int expectedHighlights = 2; assertExpectedHighlightCount(maxNumFragmentsRequired, expectedHighlights); } public void testNearSpanSimpleQuery() throws Exception { doSearching(new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "beginning")), new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false)); TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { mode = QUERY; doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); } }; helper.run(); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } public void testSimpleQueryTermScorerHighlighter() throws Exception { doSearching("Kennedy"); Highlighter highlighter = new Highlighter(new QueryTermScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(40)); int maxNumFragmentsRequired = 2; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println("\t" + result); } // Not sure we can assert anything here - just running to check we dont // throw any exceptions } public void testSpanHighlighting() throws Exception { Query query1 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "wordx")), new SpanTermQuery(new Term(FIELD_NAME, "wordy")) }, 1, false); Query query2 = new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "wordy")), new SpanTermQuery(new Term(FIELD_NAME, "wordc")) }, 1, false); BooleanQuery bquery = new BooleanQuery(); bquery.add(query1, Occur.SHOULD); bquery.add(query2, Occur.SHOULD); doSearching(bquery); TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { mode = QUERY; doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); } }; helper.run(); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 7); } public void testNotSpanSimpleQuery() throws Exception { doSearching(new SpanNotQuery(new SpanNearQuery(new SpanQuery[] { new SpanTermQuery(new Term(FIELD_NAME, "shot")), new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false), new SpanTermQuery( new Term(FIELD_NAME, "john")))); TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { mode = QUERY; doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); } }; helper.run(); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } public void testGetBestFragmentsSimpleQuery() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("Kennedy"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } }; helper.start(); } public void testGetFuzzyFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("Kinnedy~"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } }; helper.start(); } public void testGetWildCardFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("K?nnedy"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } }; helper.start(); } public void testGetMidWildCardFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("K*dy"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } }; helper.start(); } public void testGetRangeFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; String queryString = FIELD_NAME + ":[kannedy TO kznnedy]"; // Need to explicitly set the QueryParser property to use TermRangeQuery // rather // than RangeFilters QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); query = parser.parse(queryString); doSearching(query); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } }; helper.start(); } public void testConstantScoreMultiTermQuery() throws Exception { numHighlights = 0; query = new WildcardQuery(new Term(FIELD_NAME, "ken*")); ((WildcardQuery)query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE); searcher = new IndexSearcher(ramDir, true); // can't rewrite ConstantScore if you want to highlight it - // it rewrites to ConstantScoreQuery which cannot be highlighted // query = unReWrittenQuery.rewrite(reader); System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits = searcher.search(query, null, 1000); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; QueryScorer scorer = null; TokenStream tokenStream = null; tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(20)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); // try null field hits = searcher.search(query, null, 1000); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; QueryScorer scorer = null; TokenStream tokenStream = null; tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); scorer = new QueryScorer(query, null); Highlighter highlighter = new Highlighter(this, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(20)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); // try default field hits = searcher.search(query, null, 1000); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME); int maxNumFragmentsRequired = 2; String fragmentSeparator = "..."; QueryScorer scorer = null; TokenStream tokenStream = null; tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text)); scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME); Highlighter highlighter = new Highlighter(this, scorer); highlighter.setTextFragmenter(new SimpleFragmenter(20)); String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, fragmentSeparator); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } public void testGetBestFragmentsPhrase() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("\"John Kennedy\""); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); // Currently highlights "John" and "Kennedy" separately assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } }; helper.start(); } public void testGetBestFragmentsQueryScorer() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")), new SpanTermQuery(new Term("contents", "kennedy")), }; SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); doSearching(snq); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); // Currently highlights "John" and "Kennedy" separately assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } }; helper.start(); } public void testOffByOne() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { TermQuery query = new TermQuery(new Term("data", "help")); Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryTermScorer(query)); hg.setTextFragmenter(new NullFragmenter()); String match = null; match = hg.getBestFragment(analyzer, "data", "help me [54-65]"); assertEquals("help me [54-65]", match); } }; helper.start(); } public void testGetBestFragmentsFilteredQuery() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true); SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")), new SpanTermQuery(new Term("contents", "kennedy")), }; SpanNearQuery snq = new SpanNearQuery(clauses, 1, true); FilteredQuery fq = new FilteredQuery(snq, rf); doSearching(fq); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); // Currently highlights "John" and "Kennedy" separately assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } }; helper.start(); } public void testGetBestFragmentsFilteredPhraseQuery() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true); PhraseQuery pq = new PhraseQuery(); pq.add(new Term("contents", "john")); pq.add(new Term("contents", "kennedy")); FilteredQuery fq = new FilteredQuery(pq, rf); doSearching(fq); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); // Currently highlights "John" and "Kennedy" separately assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); } }; helper.start(); } public void testGetBestFragmentsMultiTerm() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("John Kenn*"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } }; helper.start(); } public void testGetBestFragmentsWithOr() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("JFK OR Kennedy"); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } }; helper.start(); } public void testGetBestSingleFragment() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { doSearching("Kennedy"); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String result = highlighter.getBestFragment(tokenStream, text); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this); highlighter.getBestFragment(analyzer, FIELD_NAME, text); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); numHighlights = 0; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this); highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } }; helper.start(); } public void testGetBestSingleFragmentWithWeights() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2]; wTerms[0] = new WeightedSpanTerm(10f, "hello"); List positionSpans = new ArrayList(); positionSpans.add(new PositionSpan(0, 0)); wTerms[0].addPositionSpans(positionSpans); wTerms[1] = new WeightedSpanTerm(1f, "kennedy"); positionSpans = new ArrayList(); positionSpans.add(new PositionSpan(14, 14)); wTerms[1].addPositionSpans(positionSpans); Highlighter highlighter = getHighlighter(wTerms, HighlighterTest.this);// new // Highlighter(new // QueryTermScorer(wTerms)); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); highlighter.setTextFragmenter(new SimpleFragmenter(2)); String result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); assertTrue("Failed to find best section using weighted terms. Found: [" + result + "]", "Hello".equals(result)); // readjust weights wTerms[1].setWeight(50f); tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); highlighter = getHighlighter(wTerms, HighlighterTest.this); highlighter.setTextFragmenter(new SimpleFragmenter(2)); result = highlighter.getBestFragment(tokenStream, texts[0]).trim(); assertTrue("Failed to find best section using weighted terms. Found: " + result, "kennedy".equals(result)); } }; helper.start(); } // tests a "complex" analyzer that produces multiple // overlapping tokens public void testOverlapAnalyzer() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { HashMap synonyms = new HashMap(); synonyms.put("football", "soccer,footie"); Analyzer analyzer = new SynonymAnalyzer(synonyms); String srchkey = "football"; String s = "football-soccer in the euro 2004 footie competition"; QueryParser parser = new QueryParser(TEST_VERSION, "bookid", analyzer); Query query = parser.parse(srchkey); TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this); // Get 3 best fragments and seperate with a "..." tokenStream = analyzer.tokenStream(null, new StringReader(s)); String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); String expectedResult = "football-soccer in the euro 2004 footie competition"; assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult + " actual:" + result, expectedResult.equals(result)); } }; helper.start(); } public void testGetSimpleHighlight() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("Kennedy"); // new Highlighter(HighlighterTest.this, new QueryTermScorer(query)); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this); String result = highlighter.getBestFragment(tokenStream, text); System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); } }; helper.start(); } public void testGetTextFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { doSearching("Kennedy"); for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this);// new Highlighter(this, new // QueryTermScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(20)); String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10); tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream, text, true, 10); assertTrue("Failed to find correct number of text Fragments: " + fragmentResults.length + " vs " + stringResults.length, fragmentResults.length == stringResults.length); for (int j = 0; j < stringResults.length; j++) { System.out.println(fragmentResults[j]); assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + " found", fragmentResults[j].toString().equals(stringResults[j])); } } } }; helper.start(); } public void testMaxSizeHighlight() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; doSearching("meat"); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0])); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this);// new Highlighter(this, new // QueryTermScorer(query)); highlighter.setMaxDocCharsToAnalyze(30); highlighter.getBestFragment(tokenStream, texts[0]); assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + "us from finding matches for this record: " + numHighlights + " found", numHighlights == 0); } }; helper.start(); } public void testMaxSizeHighlightTruncates() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { String goodWord = "goodtoken"; Set stopWords = new HashSet(1); stopWords.add("stoppedtoken"); TermQuery query = new TermQuery(new Term("data", goodWord)); String match = null; StringBuilder sb = new StringBuilder(); sb.append(goodWord); for (int i = 0; i < 10000; i++) { sb.append(" "); // only one stopword sb.append(stopWords.iterator().next()); } SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(TEST_VERSION, stopWords).tokenStream( "data", new StringReader(sb.toString())), fm);// new Highlighter(fm, // new // QueryTermScorer(query)); hg.setTextFragmenter(new NullFragmenter()); hg.setMaxDocCharsToAnalyze(100); match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "data", sb.toString()); assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg .getMaxDocCharsToAnalyze()); // add another tokenized word to the overrall length - but set way // beyond // the length of text under consideration (after a large slug of stop // words // + whitespace) sb.append(" "); sb.append(goodWord); match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "data", sb.toString()); assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg .getMaxDocCharsToAnalyze()); } }; helper.start(); } public void testMaxSizeEndHighlight() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { Set stopWords = new HashSet(); stopWords.add("in"); stopWords.add("it"); TermQuery query = new TermQuery(new Term("text", "searchterm")); String text = "this is a text with searchterm in it"; SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); Highlighter hg = getHighlighter(query, "text", new StandardAnalyzer(TEST_VERSION, stopWords).tokenStream("text", new StringReader(text)), fm); hg.setTextFragmenter(new NullFragmenter()); hg.setMaxDocCharsToAnalyze(36); String match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION, stopWords), "text", text); assertTrue( "Matched text should contain remainder of text after highlighted query ", match.endsWith("in it")); } }; helper.start(); } public void testUnRewrittenQuery() throws Exception { final TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { numHighlights = 0; // test to show how rewritten query can still be used searcher = new IndexSearcher(ramDir, true); Analyzer analyzer = new StandardAnalyzer(TEST_VERSION); QueryParser parser = new QueryParser(TEST_VERSION, FIELD_NAME, analyzer); Query query = parser.parse("JF? or Kenned*"); System.out.println("Searching with primitive query"); // forget to set this and... // query=query.rewrite(reader); TopDocs hits = searcher.search(query, null, 1000); // create an instance of the highlighter with the tags used to surround // highlighted text // QueryHighlightExtractor highlighter = new // QueryHighlightExtractor(this, // query, new StandardAnalyzer(TEST_VERSION)); int maxNumFragmentsRequired = 3; for (int i = 0; i < hits.totalHits; i++) { String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this, false); highlighter.setTextFragmenter(new SimpleFragmenter(40)); String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "..."); System.out.println(highlightedText); } // We expect to have zero highlights if the query is multi-terms and is // not // rewritten! assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0); } }; helper.start(); } public void testNoFragments() throws Exception { TestHighlightRunner helper = new TestHighlightRunner() { @Override public void run() throws Exception { doSearching("AnInvalidQueryWhichShouldYieldNoResults"); for (int i = 0; i < texts.length; i++) { String text = texts[i]; TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text)); Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this); String result = highlighter.getBestFragment(tokenStream, text); assertNull("The highlight result should be null for text with no query terms", result); } } }; helper.start(); } /** * Demonstrates creation of an XHTML compliant doc using new encoding facilities. * * @throws Exception */ public void testEncoding() throws Exception { String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article"; // run the highlighter on the raw content (scorer does not score any tokens // for // highlighting but scores a single fragment for selection Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new Scorer() { public void startFragment(TextFragment newFragment) { } public float getTokenScore() { return 0; } public float getFragmentScore() { return 1; } public TokenStream init(TokenStream tokenStream) { return null; } }); highlighter.setTextFragmenter(new SimpleFragmenter(2000)); TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent)); String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, ""); // An ugly bit of XML creation: String xhtml = "\n" + "\n" + "
\n" + "