/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Test.Analysis;
using NUnit.Framework;
using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using English = Lucene.Net.Util.English;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Tests for <c>TeeSinkTokenFilter</c>: verifies that a tee'd stream and its
    /// sinks see the same tokens, that sink filters select the expected subset,
    /// and (non-test) compares tee/sink performance against re-analyzing.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the generic type arguments on every AddAttribute/GetAttribute
    /// call had been stripped (most likely by an HTML/markup conversion eating the
    /// angle-bracketed text); they are restored below so the file compiles again.
    /// </remarks>
    [TestFixture]
    public class TestTeeSinkTokenFilter : BaseTokenStreamTestCase
    {
        /// <summary>Sink filter accepting only tokens whose term equals "the" (case-insensitive).</summary>
        public class AnonymousClassSinkFilter : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                ITermAttribute termAtt = a.GetAttribute<ITermAttribute>();
                return termAtt.Term.ToUpper().Equals("The".ToUpper());
            }
        }

        /// <summary>Sink filter accepting only tokens whose term equals "dogs" (case-insensitive).</summary>
        public class AnonymousClassSinkFilter1 : TeeSinkTokenFilter.SinkFilter
        {
            public override bool Accept(AttributeSource a)
            {
                ITermAttribute termAtt = a.GetAttribute<ITermAttribute>();
                return termAtt.Term.ToUpper().Equals("Dogs".ToUpper());
            }
        }

        // Space-separated concatenations of tokens1/tokens2, rebuilt in SetUp.
        protected internal System.Text.StringBuilder buffer1;
        protected internal System.Text.StringBuilder buffer2;
        protected internal System.String[] tokens1;
        protected internal System.String[] tokens2;

        public TestTeeSinkTokenFilter(System.String s) : base(s)
        {
        }

        public TestTeeSinkTokenFilter()
        {
        }

        /// <summary>Builds the two test token sequences and their whitespace-joined buffers.</summary>
        [SetUp]
        public override void SetUp()
        {
            base.SetUp();
            tokens1 = new System.String[] { "The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs" };
            tokens2 = new System.String[] { "The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch" };
            buffer1 = new System.Text.StringBuilder();
            for (int i = 0; i < tokens1.Length; i++)
            {
                buffer1.Append(tokens1[i]).Append(' ');
            }
            buffer2 = new System.Text.StringBuilder();
            for (int i = 0; i < tokens2.Length; i++)
            {
                buffer2.Append(tokens2[i]).Append(' ');
            }
        }

        // Initialized in the static constructor at the bottom of the class.
        internal static readonly TeeSinkTokenFilter.SinkFilter theFilter;
        internal static readonly TeeSinkTokenFilter.SinkFilter dogFilter;

        /// <summary>An unfiltered sink replays exactly the tokens its source produced.</summary>
        [Test]
        public virtual void TestGeneral()
        {
            TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
            TokenStream sink1 = source.NewSinkTokenStream();
            TokenStream sink2 = source.NewSinkTokenStream(theFilter);

            // BUGFIX: restored the generic type arguments required for compilation.
            source.AddAttribute<ICheckClearAttributesAttribute>();
            sink1.AddAttribute<ICheckClearAttributesAttribute>();
            sink2.AddAttribute<ICheckClearAttributesAttribute>();

            AssertTokenStreamContents(source, tokens1);
            AssertTokenStreamContents(sink1, tokens1);
            // BUGFIX: sink2 was created but never verified; theFilter selects the
            // case-insensitive "the" tokens from tokens1 ("The" and "the").
            AssertTokenStreamContents(sink2, new String[] { "The", "the" });
        }

        /// <summary>
        /// A sink attached to two tee sources accumulates matching tokens from both
        /// streams, and a cached source can be replayed through further filters.
        /// </summary>
        [Test]
        public virtual void TestMultipleSources()
        {
            TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
            TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
            TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
            TokenStream source1 = new CachingTokenFilter(tee1);

            // BUGFIX: restored the generic type arguments required for compilation.
            tee1.AddAttribute<ICheckClearAttributesAttribute>();
            dogDetector.AddAttribute<ICheckClearAttributesAttribute>();
            theDetector.AddAttribute<ICheckClearAttributesAttribute>();

            TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
            tee2.AddSinkTokenStream(dogDetector);
            tee2.AddSinkTokenStream(theDetector);
            TokenStream source2 = tee2;

            AssertTokenStreamContents(source1, tokens1);
            AssertTokenStreamContents(source2, tokens2);

            // "The"/"the" occur twice in each buffer; "Dogs" occurs once in each.
            AssertTokenStreamContents(theDetector, new String[] { "The", "the", "The", "the" });
            // BUGFIX: dogDetector was wired to both tees but never verified.
            AssertTokenStreamContents(dogDetector, new String[] { "Dogs", "Dogs" });

            source1.Reset();
            TokenStream lowerCasing = new LowerCaseFilter(source1);
            String[] lowerCaseTokens = new String[tokens1.Length];
            for (int i = 0; i < tokens1.Length; i++)
                lowerCaseTokens[i] = tokens1[i].ToLower();
            // BUGFIX: lowerCasing/lowerCaseTokens were built but never asserted
            // (dead code) — the cached source must replay through LowerCaseFilter.
            AssertTokenStreamContents(lowerCasing, lowerCaseTokens);
        }

        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance:
        /// compares re-analyzing a field per modulus against a single tee with a
        /// modulo sink, checking both approaches see identical tokens/positions.
        /// </summary>
        public virtual void Performance()
        {
            int[] tokCount = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                System.Text.StringBuilder buffer = new System.Text.StringBuilder();
                System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
                }

                // Make sure we produce the same tokens.
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
                TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), 100);
                ITermAttribute tfTok = stream.AddAttribute<ITermAttribute>();
                ITermAttribute sinkTok = sink.AddAttribute<ITermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    Assert.IsTrue(sink.IncrementToken());
                    // (simplified from "== true" — Equals already yields a bool)
                    Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
                }

                // Simulate two fields, each being analyzed once, for 20 documents.
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute<IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");

                    int sinkPos = 0;
                    // Simulate one field with one sink.
                    start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(Version.LUCENE_CURRENT, new System.IO.StringReader(buffer.ToString()))));
                        sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute<IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = sink.GetAttribute<IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
                    System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
                }
                System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }

        /// <summary>Token filter that passes through only every modCount-th token.</summary>
        internal class ModuloTokenFilter : TokenFilter
        {
            private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }
            private TestTeeSinkTokenFilter enclosingInstance;
            public TestTeeSinkTokenFilter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            internal int modCount;

            internal ModuloTokenFilter(TestTeeSinkTokenFilter enclosingInstance, TokenStream input, int mc) : base(input)
            {
                InitBlock(enclosingInstance);
                modCount = mc;
            }

            internal int count = 0;

            // Return every modCount-th token; skip the rest.
            public override bool IncrementToken()
            {
                bool hasNext;
                for (hasNext = input.IncrementToken(); hasNext && count % modCount != 0; hasNext = input.IncrementToken())
                {
                    count++;
                }
                count++;
                return hasNext;
            }
        }

        /// <summary>Sink filter that accepts only every modCount-th token it is offered.</summary>
        internal class ModuloSinkFilter : TeeSinkTokenFilter.SinkFilter
        {
            private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
            }
            private TestTeeSinkTokenFilter enclosingInstance;
            public TestTeeSinkTokenFilter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            internal int count = 0;
            internal int modCount;

            internal ModuloSinkFilter(TestTeeSinkTokenFilter enclosingInstance, int mc)
            {
                InitBlock(enclosingInstance);
                modCount = mc;
            }

            public override bool Accept(AttributeSource a)
            {
                bool b = (a != null && count % modCount == 0);
                count++;
                return b;
            }
        }

        static TestTeeSinkTokenFilter()
        {
            theFilter = new AnonymousClassSinkFilter();
            dogFilter = new AnonymousClassSinkFilter1();
        }
    }
}