/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NUnit.Framework;
using Directory = Lucene.Net.Store.Directory;

namespace Lucene.Net.Analysis.Shingle
{
    /// <summary>
    /// A test class for ShingleAnalyzerWrapper as regards queries and scoring.
    /// </summary>
    public class ShingleAnalyzerWrapperTest : BaseTokenStreamTestCase
    {
        public IndexSearcher Searcher;

        /// <summary>
        /// Set up a new index in RAM with three test phrases and the supplied Analyzer.
        /// </summary>
        /// <param name="analyzer">the analyzer to use</param>
        /// <returns>an IndexSearcher on the test index</returns>
        public IndexSearcher SetUpSearcher(Analyzer analyzer)
        {
            Directory dir = new RAMDirectory();
            var writer = new IndexWriter(dir, analyzer, true);

            var doc = new Document();
            doc.Add(new Field("content", "please divide this sentence into shingles",
                              Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);

            doc = new Document();
            doc.Add(new Field("content", "just another test sentence",
                              Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);

            doc = new Document();
            doc.Add(new Field("content", "a sentence which contains no test",
                              Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);

            writer.Close();

            return new IndexSearcher(dir);
        }

        protected Hits QueryParsingTest(Analyzer analyzer, String qs)
        {
            Searcher = SetUpSearcher(analyzer);

            var qp = new QueryParser("content", analyzer);
            var q = qp.Parse(qs);

            return Searcher.Search(q);
        }

        protected void CompareRanks(Hits hits, int[] ranks)
        {
            Assert.AreEqual(ranks.Length, hits.Length());
            for (int i = 0; i < ranks.Length; i++)
            {
                Assert.AreEqual(ranks[i], hits.Id(i));
            }
        }

        /// <summary>
        /// Will not work on an index without unigrams, since QueryParser automatically tokenizes on whitespace.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperQueryParsing()
        {
            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                        "test sentence");
            var ranks = new[] { 1, 2, 0 };
            CompareRanks(hits, ranks);
        }

        /// <summary>
        /// This one fails with an exception.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperPhraseQueryParsingFails()
        {
            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                        "\"this sentence\"");
            var ranks = new[] { 0 };
            CompareRanks(hits, ranks);
        }

        /// <summary>
        /// This one works, actually.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperPhraseQueryParsing()
        {
            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                        "\"test sentence\"");
            var ranks = new[] { 1 };
            CompareRanks(hits, ranks);
        }

        /// <summary>
        /// Same as above; the query is tokenized by the query parser without using the analyzer.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperRequiredQueryParsing()
        {
            var hits = QueryParsingTest(new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2),
                                        "+test +sentence");
            var ranks = new[] { 1, 2 };
            CompareRanks(hits, ranks);
        }

        /// <summary>
        /// This shows how to construct a phrase query containing shingles.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperPhraseQuery()
        {
            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
            Searcher = SetUpSearcher(analyzer);

            var q = new PhraseQuery();

            var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
            var j = -1;
            var posIncrAtt = (PositionIncrementAttribute) ts.AddAttribute(typeof(PositionIncrementAttribute));
            var termAtt = (TermAttribute) ts.AddAttribute(typeof(TermAttribute));
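
            // For "this sentence", the wrapped analyzer emits "this" (position
            // increment 1), "this sentence" (increment 0) and "sentence" (increment 1).
            // Summing the increments therefore places each bigram at the same phrase
            // position as the unigram it starts with: positions 0, 0 and 1.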
            while (ts.IncrementToken())
            {
                j += posIncrAtt.GetPositionIncrement();
                var termText = termAtt.Term();
                q.Add(new Term("content", termText), j);
            }

            var hits = Searcher.Search(q);
            var ranks = new[] { 0 };
            CompareRanks(hits, ranks);
        }

        /// <summary>
        /// How to construct a boolean query with shingles. A query like this will
        /// implicitly score those documents higher that contain the words in the query
        /// in the right order and adjacent to each other.
        /// </summary>
        [Test]
        public void TestShingleAnalyzerWrapperBooleanQuery()
        {
            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
            Searcher = SetUpSearcher(analyzer);

            var q = new BooleanQuery();

            var ts = analyzer.TokenStream("content", new StringReader("test sentence"));
            var termAtt = (TermAttribute) ts.AddAttribute(typeof(TermAttribute));

            while (ts.IncrementToken())
            {
                var termText = termAtt.Term();
                q.Add(new TermQuery(new Term("content", termText)), BooleanClause.Occur.SHOULD);
            }

            var hits = Searcher.Search(q);
            var ranks = new[] { 1, 2, 0 };
            CompareRanks(hits, ranks);
        }

        [Test]
        public void TestReusableTokenStream()
        {
            Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
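
            // Argument groups for AssertAnalyzesToReuse, in order: expected terms,
            // start offsets, end offsets and position increments. A shingle spans from
            // the start offset of its first word to the end offset of its last word,
            // and its position increment of 0 stacks it on its leading unigram.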
            AssertAnalyzesToReuse(a, "please divide into shingles",
                                  new[]
                                      {
                                          "please", "please divide", "divide", "divide into",
                                          "into", "into shingles", "shingles"
                                      },
                                  new[] { 0, 0, 7, 7, 14, 14, 19 },
                                  new[] { 6, 13, 13, 18, 18, 27, 27 },
                                  new[] { 1, 0, 1, 0, 1, 0, 1 });

            AssertAnalyzesToReuse(a, "divide me up again",
                                  new[] { "divide", "divide me", "me", "me up", "up", "up again", "again" },
                                  new[] { 0, 0, 7, 7, 10, 10, 13 },
                                  new[] { 6, 9, 9, 12, 12, 18, 18 },
                                  new[] { 1, 0, 1, 0, 1, 0, 1 });
        }

        /// <summary>
        /// Uses a subclass that acts just like a WhitespaceAnalyzer, for testing.
        /// </summary>
        [Test]
        public void TestLucene1678BwComp()
        {
            Analyzer a = new ShingleWrapperSubclassAnalyzer();
            AssertAnalyzesToReuse(a, "this is a test",
                                  new[] { "this", "is", "a", "test" },
                                  new[] { 0, 5, 8, 10 },
                                  new[] { 4, 7, 9, 14 });
        }

        /// <summary>
        /// Uses an analyzer that does not support reuse: it is a LetterTokenizer on
        /// odd invocations and a WhitespaceTokenizer on even ones.
        /// </summary>
        [Test]
        public void TestWrappedAnalyzerDoesNotReuse()
        {
            Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());

            // First invocation (LetterTokenizer): the trailing period is dropped.
            AssertAnalyzesToReuse(a, "please divide into shingles.",
                                  new[]
                                      {
                                          "please", "please divide", "divide", "divide into",
                                          "into", "into shingles", "shingles"
                                      },
                                  new[] { 0, 0, 7, 7, 14, 14, 19 },
                                  new[] { 6, 13, 13, 18, 18, 27, 27 },
                                  new[] { 1, 0, 1, 0, 1, 0, 1 });

            // Second invocation (WhitespaceTokenizer): "shingles." keeps its period.
            AssertAnalyzesToReuse(a, "please divide into shingles.",
                                  new[]
                                      {
                                          "please", "please divide", "divide", "divide into",
                                          "into", "into shingles.", "shingles."
                                      },
                                  new[] { 0, 0, 7, 7, 14, 14, 19 },
                                  new[] { 6, 13, 13, 18, 18, 28, 28 },
                                  new[] { 1, 0, 1, 0, 1, 0, 1 });

            // Third invocation: back to a LetterTokenizer.
            AssertAnalyzesToReuse(a, "please divide into shingles.",
                                  new[]
                                      {
                                          "please", "please divide", "divide", "divide into",
                                          "into", "into shingles", "shingles"
                                      },
                                  new[] { 0, 0, 7, 7, 14, 14, 19 },
                                  new[] { 6, 13, 13, 18, 18, 27, 27 },
                                  new[] { 1, 0, 1, 0, 1, 0, 1 });
        }

        #region Nested type: NonreusableAnalyzer

        private class NonreusableAnalyzer : Analyzer
        {
            private int _invocationCount;

            public override TokenStream TokenStream(String fieldName, TextReader reader)
            {
                // Alternate tokenizers so that consecutive calls cannot share state.
                if (++_invocationCount % 2 == 0)
                    return new WhitespaceTokenizer(reader);
                return new LetterTokenizer(reader);
            }
        }

        #endregion

        #region Nested type: ShingleWrapperSubclassAnalyzer

        private class ShingleWrapperSubclassAnalyzer : ShingleAnalyzerWrapper
        {
            public override TokenStream TokenStream(String fieldName, TextReader reader)
            {
                return new WhitespaceTokenizer(reader);
            }
        }

        #endregion
    }
}