/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using NUnit.Framework;
using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using English = Lucene.Net.Util.English;
namespace Lucene.Net.Analysis
{
/// <summary>Tests for the TeeSinkTokenFilter.</summary>
[TestFixture]
public class TestTeeSinkTokenFilter:BaseTokenStreamTestCase
{
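// Sink filter that accepts only tokens whose term equals "The" (case-insensitive).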
public class AnonymousClassSinkFilter:TeeSinkTokenFilter.SinkFilter
{
public override bool Accept(AttributeSource a)
{
TermAttribute termAtt = (TermAttribute) a.GetAttribute(typeof(TermAttribute));
return termAtt.Term().ToUpper().Equals("The".ToUpper());
}
}
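// Sink filter that accepts only tokens whose term equals "Dogs" (case-insensitive).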
public class AnonymousClassSinkFilter1:TeeSinkTokenFilter.SinkFilter
{
public override bool Accept(AttributeSource a)
{
TermAttribute termAtt = (TermAttribute) a.GetAttribute(typeof(TermAttribute));
return termAtt.Term().ToUpper().Equals("Dogs".ToUpper());
}
}
protected internal System.Text.StringBuilder buffer1;
protected internal System.Text.StringBuilder buffer2;
protected internal System.String[] tokens1;
protected internal System.String[] tokens2;
public TestTeeSinkTokenFilter(System.String s):base(s)
{
}
public TestTeeSinkTokenFilter()
{
}
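// Builds two whitespace-separated buffers from the expected token arrays; the
// tests tokenize these buffers and compare the output against tokens1/tokens2.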
[SetUp]
public override void SetUp()
{
base.SetUp();
tokens1 = new System.String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
tokens2 = new System.String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
buffer1 = new System.Text.StringBuilder();
for (int i = 0; i < tokens1.Length; i++)
{
buffer1.Append(tokens1[i]).Append(' ');
}
buffer2 = new System.Text.StringBuilder();
for (int i = 0; i < tokens2.Length; i++)
{
buffer2.Append(tokens2[i]).Append(' ');
}
}
internal static readonly TeeSinkTokenFilter.SinkFilter theFilter;
internal static readonly TeeSinkTokenFilter.SinkFilter dogFilter;
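// Verifies that the tee source and an unfiltered sink both reproduce the full token
// stream, while a sink created with theFilter only sees the "The" tokens.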
[Test]
public virtual void TestGeneral()
{
TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
TokenStream sink1 = source.NewSinkTokenStream();
TokenStream sink2 = source.NewSinkTokenStream(theFilter);
int i = 0;
TermAttribute termAtt = (TermAttribute) source.GetAttribute(typeof(TermAttribute));
while (source.IncrementToken())
{
Assert.AreEqual(tokens1[i], termAtt.Term());
i++;
}
Assert.AreEqual(tokens1.Length, i);
i = 0;
termAtt = (TermAttribute) sink1.GetAttribute(typeof(TermAttribute));
while (sink1.IncrementToken())
{
Assert.AreEqual(tokens1[i], termAtt.Term());
i++;
}
Assert.AreEqual(tokens1.Length, i);
i = 0;
termAtt = (TermAttribute) sink2.GetAttribute(typeof(TermAttribute));
while (sink2.IncrementToken())
{
Assert.IsTrue(termAtt.Term().ToUpper().Equals("The".ToUpper()));
i++;
}
Assert.AreEqual(2, i, "there should be two times 'the' in the stream");
}
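// Attaches the same two sinks to two different tee filters, so each sink
// accumulates its matching tokens from both underlying streams.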
[Test]
public virtual void TestMultipleSources()
{
TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer1.ToString())));
TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.NewSinkTokenStream(dogFilter);
TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.NewSinkTokenStream(theFilter);
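// Cache the first stream so it can be reset and replayed through a LowerCaseFilter below.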
TokenStream source1 = new CachingTokenFilter(tee1);
TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(new System.IO.StringReader(buffer2.ToString())));
tee2.AddSinkTokenStream(dogDetector);
tee2.AddSinkTokenStream(theDetector);
TokenStream source2 = tee2;
int i = 0;
TermAttribute termAtt = (TermAttribute) source1.GetAttribute(typeof(TermAttribute));
while (source1.IncrementToken())
{
Assert.AreEqual(tokens1[i], termAtt.Term());
i++;
}
Assert.AreEqual(tokens1.Length, i);
i = 0;
termAtt = (TermAttribute) source2.GetAttribute(typeof(TermAttribute));
while (source2.IncrementToken())
{
Assert.AreEqual(tokens2[i], termAtt.Term());
i++;
}
Assert.AreEqual(tokens2.Length, i);
i = 0;
termAtt = (TermAttribute) theDetector.GetAttribute(typeof(TermAttribute));
while (theDetector.IncrementToken())
{
Assert.IsTrue(termAtt.Term().ToUpper().Equals("The".ToUpper()), "'" + termAtt.Term() + "' is not equal to 'The'");
i++;
}
Assert.AreEqual(4, i, "there must be 4 times 'The' in the stream");
i = 0;
termAtt = (TermAttribute) dogDetector.GetAttribute(typeof(TermAttribute));
while (dogDetector.IncrementToken())
{
Assert.IsTrue(termAtt.Term().ToUpper().Equals("Dogs".ToUpper()), "'" + termAtt.Term() + "' is not equal to 'Dogs'");
i++;
}
Assert.AreEqual(2, i, "there must be 2 times 'Dogs' in the stream");
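// Replay the cached first stream through a LowerCaseFilter to check that the cached tokens can be re-consumed.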
source1.Reset();
TokenStream lowerCasing = new LowerCaseFilter(source1);
i = 0;
termAtt = (TermAttribute) lowerCasing.GetAttribute(typeof(TermAttribute));
while (lowerCasing.IncrementToken())
{
Assert.AreEqual(tokens1[i].ToLower(), termAtt.Term());
i++;
}
Assert.AreEqual(i, tokens1.Length);
}
/// <summary>Not an explicit test, just useful to print out some info on performance.</summary>
/// <throws>Exception</throws>
public virtual void Performance()
{
int[] tokCount = new int[]{100, 500, 1000, 2000, 5000, 10000};
int[] modCounts = new int[]{1, 2, 5, 10, 20, 50, 100, 200, 500};
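// For each token count, compare re-analyzing the text twice (full stream plus a
// modulo-filtered stream) against analyzing it once through a tee with a modulo sink.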
for (int k = 0; k < tokCount.Length; k++)
{
System.Text.StringBuilder buffer = new System.Text.StringBuilder();
System.Console.Out.WriteLine("-----Tokens: " + tokCount[k] + "-----");
for (int i = 0; i < tokCount[k]; i++)
{
buffer.Append(English.IntToEnglish(i).ToUpper()).Append(' ');
}
//make sure we produce the same tokens
TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
TokenStream sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
teeStream.ConsumeAllTokens();
TokenStream stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), 100);
TermAttribute tfTok = (TermAttribute) stream.AddAttribute(typeof(TermAttribute));
TermAttribute sinkTok = (TermAttribute) sink.AddAttribute(typeof(TermAttribute));
for (int i = 0; stream.IncrementToken(); i++)
{
Assert.IsTrue(sink.IncrementToken());
Assert.IsTrue(tfTok.Equals(sinkTok), tfTok + " is not equal to " + sinkTok + " at token: " + i);
}
//simulate two fields, each being analyzed once, for 20 documents
for (int j = 0; j < modCounts.Length; j++)
{
int tfPos = 0;
long start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
for (int i = 0; i < 20; i++)
{
stream = new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString())));
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
while (stream.IncrementToken())
{
tfPos += posIncrAtt.GetPositionIncrement();
}
stream = new ModuloTokenFilter(this, new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))), modCounts[j]);
posIncrAtt = (PositionIncrementAttribute) stream.GetAttribute(typeof(PositionIncrementAttribute));
while (stream.IncrementToken())
{
tfPos += posIncrAtt.GetPositionIncrement();
}
}
long finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
int sinkPos = 0;
//simulate one field with one sink
start = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
for (int i = 0; i < 20; i++)
{
teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new System.IO.StringReader(buffer.ToString()))));
sink = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.GetAttribute(typeof(PositionIncrementAttribute));
while (teeStream.IncrementToken())
{
sinkPos += posIncrAtt.GetPositionIncrement();
}
//System.out.println("Modulo--------");
posIncrAtt = (PositionIncrementAttribute) sink.GetAttribute(typeof(PositionIncrementAttribute));
while (sink.IncrementToken())
{
sinkPos += posIncrAtt.GetPositionIncrement();
}
}
finish = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
System.Console.Out.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
Assert.IsTrue(sinkPos == tfPos, sinkPos + " does not equal: " + tfPos);
}
System.Console.Out.WriteLine("- End Tokens: " + tokCount[k] + "-----");
}
}
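// TokenFilter that passes through only every modCount-th token, used to simulate a sparser second field.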
internal class ModuloTokenFilter:TokenFilter
{
private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
}
private TestTeeSinkTokenFilter enclosingInstance;
public TestTeeSinkTokenFilter Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
internal int modCount;
internal ModuloTokenFilter(TestTeeSinkTokenFilter enclosingInstance, TokenStream input, int mc):base(input)
{
InitBlock(enclosingInstance);
modCount = mc;
}
internal int count = 0;
// pass through only every modCount-th token and skip the rest
public override bool IncrementToken()
{
bool hasNext;
for (hasNext = input.IncrementToken(); hasNext && count % modCount != 0; hasNext = input.IncrementToken())
{
count++;
}
count++;
return hasNext;
}
}
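// Sink filter counterpart of ModuloTokenFilter: accepts only every modCount-th token offered to the sink.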
internal class ModuloSinkFilter:TeeSinkTokenFilter.SinkFilter
{
private void InitBlock(TestTeeSinkTokenFilter enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
}
private TestTeeSinkTokenFilter enclosingInstance;
public TestTeeSinkTokenFilter Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
internal int count = 0;
internal int modCount;
internal ModuloSinkFilter(TestTeeSinkTokenFilter enclosingInstance, int mc)
{
InitBlock(enclosingInstance);
modCount = mc;
}
public override bool Accept(AttributeSource a)
{
bool b = (a != null && count % modCount == 0);
count++;
return b;
}
}
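// Initializes the shared sink filters used by TestGeneral and TestMultipleSources.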
static TestTeeSinkTokenFilter()
{
theFilter = new AnonymousClassSinkFilter();
dogFilter = new AnonymousClassSinkFilter1();
}
}
}