package org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.lang.ref.WeakReference; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; /** * This TokenFilter provides the ability to set aside attribute states * that have already been analyzed. This is useful in situations where multiple fields share * many common analysis steps and then go their separate ways. *
* It is also useful for doing things like entity extraction or proper noun analysis as * part of the analysis workflow and saving off those tokens for use in another field. * *
TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
source2.addSinkTokenStream(sink1);
source2.addSinkTokenStream(sink2);
TokenStream final1 = new LowerCaseFilter(source1);
TokenStream final2 = source2;
TokenStream final3 = new EntityDetect(sink1);
TokenStream final4 = new URLDetect(sink2);
d.add(new Field("f1", final1));
d.add(new Field("f2", final2));
d.add(new Field("f3", final3));
d.add(new Field("f4", final4));
*
* In this example, sink1 and sink2 will both get tokens from both
* reader1 and reader2 after whitespace tokenizer
* and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
* It is important, that tees are consumed before sinks (in the above example, the field names must be
* less the sink's field names). If you are not sure, which stream is consumed first, you can simply
* add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
* This TokenFilter is exhausted after this. In the above example, change
* the example above to:
* ... TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream()); TokenStream final2 = source2.newSinkTokenStream(); sink1.consumeAllTokens(); sink2.consumeAllTokens(); ... ** In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready. *
Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
*/
public final class TeeSinkTokenFilter extends TokenFilter {
private final ListTeeSinkTokenFilter
* to this one. The supplied stream will also receive all consumed tokens.
* This method can be used to pass tokens from two different tees to one sink.
*/
public void addSinkTokenStream(final SinkTokenStream sink) {
// check that sink has correct factory
if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
}
// add eventually missing attribute impls to the existing sink
for (IteratorTeeSinkTokenFilter passes all tokens to the added sinks
* when itself is consumed. To be sure, that all tokens from the input
* stream are passed to the sinks, you can call this methods.
* This instance is exhausted after this, but all sinks are instant available.
*/
public void consumeAllTokens() throws IOException {
while (incrementToken());
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
// capture state lazily - maybe no SinkFilter accepts this state
AttributeSource.State state = null;
for (WeakReference