/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using System.Collections.Generic; using Attribute = Lucene.Net.Util.Attribute; using AttributeSource = Lucene.Net.Util.AttributeSource; namespace Lucene.Net.Analysis { /// This TokenFilter provides the ability to set aside attribute states /// that have already been analyzed. This is useful in situations where multiple fields share /// many common analysis steps and then go their separate ways. ///

/// It is also useful for doing things like entity extraction or proper noun analysis as
/// part of the analysis workflow and saving off those tokens for use in another field.
/// <code>
/// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
/// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
/// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
/// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
/// source2.addSinkTokenStream(sink1);
/// source2.addSinkTokenStream(sink2);
/// TokenStream final1 = new LowerCaseFilter(source1);
/// TokenStream final2 = source2;
/// TokenStream final3 = new EntityDetect(sink1);
/// TokenStream final4 = new URLDetect(sink2);
/// d.add(new Field("f1", final1));
/// d.add(new Field("f2", final2));
/// d.add(new Field("f3", final3));
/// d.add(new Field("f4", final4));
/// </code>
/// In this example, sink1 and sink2 will both get tokens from both
/// reader1 and reader2 after whitespace tokenizer,
/// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
/// It is important that tees are consumed before sinks (in the above example, the field names must be
/// less than the sink's field names). If you are not sure which stream is consumed first, you can simply
/// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens()"/>.
/// This TokenFilter is exhausted after this. In the above example, change
/// the example above to:
/// <code>
/// ...
/// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
/// TokenStream final2 = source2.newSinkTokenStream();
/// sink1.consumeAllTokens();
/// sink2.consumeAllTokens();
/// ...
/// </code>
/// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.

/// Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.

public sealed class TeeSinkTokenFilter : TokenFilter
{
    /// <summary>A <see cref="SinkFilter"/> that accepts every token state.</summary>
    public class AnonymousClassSinkFilter : SinkFilter
    {
        public override bool Accept(AttributeSource source)
        {
            return true;
        }
    }

    // Sinks are held via WeakReference so that a sink the caller has abandoned
    // does not keep its cached token states reachable through this tee.
    private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>();

    /// <summary>Instantiates a new TeeSinkTokenFilter.</summary>
    public TeeSinkTokenFilter(TokenStream input) : base(input)
    {
    }

    /// <summary>
    /// Returns a new <see cref="SinkTokenStream"/> that receives all tokens consumed by this stream.
    /// </summary>
    public SinkTokenStream NewSinkTokenStream()
    {
        return NewSinkTokenStream(ACCEPT_ALL_FILTER);
    }

    /// <summary>
    /// Returns a new <see cref="SinkTokenStream"/> that receives all tokens consumed by this stream
    /// that pass the supplied filter.
    /// </summary>
    /// <seealso cref="SinkFilter"/>
    public SinkTokenStream NewSinkTokenStream(SinkFilter filter)
    {
        var sink = new SinkTokenStream(this.CloneAttributes(), filter);
        sinks.AddLast(new WeakReference(sink));
        return sink;
    }

    /// <summary>
    /// Adds a <see cref="SinkTokenStream"/> created by another <c>TeeSinkTokenFilter</c>
    /// to this one. The supplied stream will also receive all consumed tokens.
    /// This method can be used to pass tokens from two different tees to one sink.
    /// </summary>
    public void AddSinkTokenStream(SinkTokenStream sink)
    {
        // check that sink has correct factory
        if (!this.Factory.Equals(sink.Factory))
        {
            throw new System.ArgumentException("The supplied sink is not compatible to this tee");
        }
        // add eventually missing attribute impls to the existing sink
        foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator())
        {
            sink.AddAttributeImpl(impl);
        }
        sinks.AddLast(new WeakReference(sink));
    }

    /// <summary>
    /// TeeSinkTokenFilter passes all tokens to the added sinks when itself is consumed.
    /// To be sure that all tokens from the input stream are passed to the sinks, you can
    /// call this method. This instance is exhausted after this, but all sinks are
    /// instantly available.
    /// </summary>
    public void ConsumeAllTokens()
    {
        while (IncrementToken())
        {
        }
    }

    public override bool IncrementToken()
    {
        if (input.IncrementToken())
        {
            // capture state lazily - maybe no SinkFilter accepts this state
            State state = null;
            foreach (WeakReference wr in sinks)
            {
                var sink = (SinkTokenStream)wr.Target;
                // Target is null once the sink has been garbage collected; skip it.
                if (sink != null)
                {
                    if (sink.Accept(this))
                    {
                        if (state == null)
                        {
                            state = this.CaptureState();
                        }
                        sink.AddState(state);
                    }
                }
            }
            return true;
        }
        return false;
    }

    public override void End()
    {
        base.End();
        // Propagate the final (end-of-stream) state to all still-live sinks so their
        // own End() can restore it (e.g. for end offsets).
        State finalState = CaptureState();
        foreach (WeakReference wr in sinks)
        {
            var sink = (SinkTokenStream)wr.Target;
            if (sink != null)
            {
                sink.SetFinalState(finalState);
            }
        }
    }

    /// <summary>A filter that decides which states to store in the sink.</summary>
    public abstract class SinkFilter
    {
        /// <summary>
        /// Returns true, iff the current state of the passed-in <see cref="AttributeSource"/>
        /// shall be stored in the sink.
        /// </summary>
        public abstract bool Accept(AttributeSource source);

        /// <summary>
        /// Called by <see cref="SinkTokenStream.Reset()"/>. This method does nothing by default
        /// and can optionally be overridden.
        /// </summary>
        public virtual void Reset()
        {
            // nothing to do; can be overridden
        }
    }

    /// <summary>A TokenStream that replays the attribute states captured by the owning tee.</summary>
    public sealed class SinkTokenStream : TokenStream
    {
        private readonly LinkedList<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
        private State finalState;
        // Non-null once consumption of this sink has started (lazy-initialized in IncrementToken).
        private IEnumerator<AttributeSource.State> it = null;
        private readonly SinkFilter filter;

        internal SinkTokenStream(AttributeSource source, SinkFilter filter) : base(source)
        {
            this.filter = filter;
        }

        internal /*private*/ bool Accept(AttributeSource source)
        {
            return filter.Accept(source);
        }

        internal /*private*/ void AddState(AttributeSource.State state)
        {
            // Once iteration has begun, adding more states would be lost/inconsistent.
            if (it != null)
            {
                throw new System.SystemException("The tee must be consumed before sinks are consumed.");
            }
            cachedStates.AddLast(state);
        }

        internal /*private*/ void SetFinalState(AttributeSource.State finalState)
        {
            this.finalState = finalState;
        }

        public override bool IncrementToken()
        {
            // lazy init the iterator
            if (it == null)
            {
                it = cachedStates.GetEnumerator();
            }

            if (!it.MoveNext())
            {
                return false;
            }

            State state = it.Current;
            RestoreState(state);
            return true;
        }

        public override void End()
        {
            if (finalState != null)
            {
                RestoreState(finalState);
            }
        }

        public override void Reset()
        {
            it = cachedStates.GetEnumerator();
        }

        protected override void Dispose(bool disposing)
        {
            // Do nothing: this stream owns no resources; the cached states are plain objects.
        }
    }

    private static readonly SinkFilter ACCEPT_ALL_FILTER;

    static TeeSinkTokenFilter()
    {
        ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter();
    }
}
}