/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using NUnit.Framework;

using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using English = Lucene.Net.Util.English;

namespace Lucene.Net.Analysis
{
    /// <summary>Tests for <see cref="TeeSinkTokenFilter"/>.</summary>
    [TestFixture]
    public class TestTeeSinkTokenFilter : BaseTokenStreamTestCase
    {
        // Sink filter that accepts only the token "the", ignoring case.
        public class AnonymousClassSinkFilter : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                TermAttribute termAtt = (TermAttribute) a.GetAttribute(typeof(TermAttribute));
                return termAtt.Term().ToUpper().Equals("The".ToUpper());
            }
        }

        // Sink filter that accepts only the token "dogs", ignoring case.
        public class AnonymousClassSinkFilter1 : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                TermAttribute termAtt = (TermAttribute) a.GetAttribute(typeof(TermAttribute));
                return termAtt.Term().ToUpper().Equals("Dogs".ToUpper());
            }
        }

        protected internal System.Text.StringBuilder buffer1;
        protected internal System.Text.StringBuilder buffer2;
        protected internal System.String[] tokens1;
        protected internal System.String[] tokens2;

        public TestTeeSinkTokenFilter(System.String s) : base(s)
        {
        }

        public TestTeeSinkTokenFilter()
        {
        }

        [SetUp]
        public override void SetUp()
        {
            base.SetUp();
            tokens1 = new System.String[] { "The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs" };
            tokens2 = new System.String[] { "The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch" };
            buffer1 = new System.Text.StringBuilder();
            for (int i = 0; i < tokens1.Length; i++)
            {
                buffer1.Append(tokens1[i]).Append(' ');
            }
            buffer2 = new System.Text.StringBuilder();
            for (int i = 0; i < tokens2.Length; i++)
            {
                buffer2.Append(tokens2[i]).Append(' ');
            }
        }

        internal static readonly TeeSinkTokenFilter.SinkFilter theFilter;
        internal static readonly TeeSinkTokenFilter.SinkFilter dogFilter;

        [Test]
        public virtual void TestGeneral()
        {
            TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
            TokenStream sink1 = source.NewSinkTokenStream();
            TokenStream sink2 = source.NewSinkTokenStream(theFilter);

            int i = 0;
            TermAttribute termAtt = (TermAttribute) source.GetAttribute(typeof(TermAttribute));
            while (source.IncrementToken())
            {
                Assert.AreEqual(tokens1[i], termAtt.Term());
                i++;
            }
            Assert.AreEqual(tokens1.Length, i);

            // The unfiltered sink replays every token seen by the source.
            i = 0;
            termAtt = (TermAttribute) sink1.GetAttribute(typeof(TermAttribute));
            while (sink1.IncrementToken())
            {
                Assert.AreEqual(tokens1[i], termAtt.Term());
                i++;
            }
            Assert.AreEqual(tokens1.Length, i);

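            // sink2 was attached with theFilter, so it should replay only the "The"/"the" tokens.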
            i = 0;
            termAtt = (TermAttribute) sink2.GetAttribute(typeof(TermAttribute));
            while (sink2.IncrementToken())
            {
                Assert.IsTrue(termAtt.Term().ToUpper().Equals("The".ToUpper()));
                i++;
            }
            Assert.AreEqual(2, i, "there should be two times 'the' in the stream");
        }

        [Test]
        public virtual void TestMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
            TokenStream source1 = new CachingTokenFilter(tee1);

            // Attach the same two sinks to a second tee; they accumulate tokens from both sources.
            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            int i = 0;
            TermAttribute termAtt = (TermAttribute) source1.GetAttribute(typeof(TermAttribute));
            while (source1.IncrementToken())
            {
                Assert.AreEqual(tokens1[i], termAtt.Term());
                i++;
            }
            Assert.AreEqual(tokens1.Length, i);

            i = 0;
            termAtt = (TermAttribute) source2.GetAttribute(typeof(TermAttribute));
            while (source2.IncrementToken())
            {
                Assert.AreEqual(tokens2[i], termAtt.Term());
                i++;
            }
            Assert.AreEqual(tokens2.Length, i);

            i = 0;
            termAtt = (TermAttribute) theDetector.GetAttribute(typeof(TermAttribute));
            while (theDetector.IncrementToken())
            {
                Assert.IsTrue(termAtt.Term().ToUpper().Equals("The".ToUpper()), "'" + termAtt.Term() + "' is not equal to 'The'");
                i++;
            }
            Assert.AreEqual(4, i, "there must be 4 times 'The' in the stream");

            i = 0;
            termAtt = (TermAttribute) dogDetector.GetAttribute(typeof(TermAttribute));
            while (dogDetector.IncrementToken())
            {
                Assert.IsTrue(termAtt.Term().ToUpper().Equals("Dogs".ToUpper()), "'" + termAtt.Term() + "' is not equal to 'Dogs'");
                i++;
            }
            Assert.AreEqual(2, i, "there must be 2 times 'Dogs' in the stream");

            // source1 is backed by a CachingTokenFilter, so it can be consumed again after Reset().
            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(source1);
            i = 0;
            termAtt = (TermAttribute) lowerCasing.GetAttribute(typeof(TermAttribute));
            while (lowerCasing.IncrementToken())
            {
                Assert.AreEqual(tokens1[i].ToLower(), termAtt.Term());
                i++;
            }
            Assert.AreEqual(tokens1.Length, i);
        }

        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance.
        /// </summary>
        /// <throws>Exception</throws>
        public virtual void Performance()
        {
            int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
                }

                // make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
                TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
                TermAttribute tfTok = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
                TermAttribute sinkTok = (TermAttribute) sink.AddAttribute(typeof(TermAttribute));
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    Assert.IsTrue(sink.IncrementToken());
                    Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
                }

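                // For each modulus, time two separate analysis passes against a single
                // pass through a tee with one sink; both paths must accumulate the same
                // number of position increments (asserted below as sinkPos == tfPos).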
                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
                        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.GetPositionIncrement();
                        }
                        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.GetPositionIncrement();
                        }
                    }
                    long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

                    int sinkPos = 0;
                    //simulate one field with one sink
                    start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
                        sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.GetAttribute(typeof(PositionIncrementAttribute));
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.GetPositionIncrement();
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = (PositionIncrementAttribute) sink.GetAttribute(typeof(PositionIncrementAttribute));
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.GetPositionIncrement();
                        }
                    }
                    finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }

        // Token filter that passes through only every modCount'th token.
        internal class ModuloTokenFilter : TokenFilter
        {
            private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }

            private TestTeeSinkTokenFilter enclosingInstance;

            public TestTeeSinkTokenFilter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            internal int modCount;

            internal ModuloTokenFilter(TestTeeSinkTokenFilter enclosingInstance, TokenStream input, int mc) : base(input)
            {
                InitBlock(enclosingInstance);
                modCount = mc;
            }

            internal int count = 0;

            // return only every modCount'th token
            public override bool IncrementToken()
            {
                bool hasNext;
                for (hasNext = input.IncrementToken(); hasNext && count % modCount != 0; hasNext = input.IncrementToken())
                {
                    count++;
                }
                count++;
                return hasNext;
            }
        }

        // Sink filter that accepts only every modCount'th token.
        internal class ModuloSinkFilter : TeeSinkTokenFilter.SinkFilter
        {
            private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }

            private TestTeeSinkTokenFilter enclosingInstance;

            public TestTeeSinkTokenFilter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            internal int count = 0;
            internal int modCount;

            internal ModuloSinkFilter(TestTeeSinkTokenFilter enclosingInstance, int mc)
            {
                InitBlock(enclosingInstance);
                modCount = mc;
            }

            public override bool Accept(AttributeSource a)
            {
                bool b = (a != null && count % modCount == 0);
                count++;
                return b;
            }
        }

        static TestTeeSinkTokenFilter()
        {
            theFilter = new AnonymousClassSinkFilter();
            dogFilter = new AnonymousClassSinkFilter1();
        }
    }
}