/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Created on 28-Oct-2004
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Hides implementation issues associated with obtaining a TokenStream for use with
    /// the highlighter - can obtain from TermFreqVectors with offsets and (optionally) positions,
    /// or from an Analyzer re-parsing the stored content.
    /// </summary>
    public class TokenSources
    {
        /// <summary>
        /// A TokenStream that replays a pre-built array of Tokens, exposing their
        /// terms and offsets through the standard attribute API.
        /// </summary>
        public class StoredTokenStream : TokenStream
        {
            protected internal Token[] tokens;
            protected internal int currentToken = 0;
            protected internal ITermAttribute termAtt;
            protected internal IOffsetAttribute offsetAtt;

            protected internal StoredTokenStream(Token[] tokens)
            {
                this.tokens = tokens;
                termAtt = AddAttribute<ITermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            public override bool IncrementToken()
            {
                if (currentToken >= tokens.Length)
                {
                    return false;
                }
                ClearAttributes();
                Token token = tokens[currentToken++];
                termAtt.SetTermBuffer(token.Term);
                offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
                return true;
            }

            protected override void Dispose(bool disposing)
            {
                // do nothing - no unmanaged resources to release
            }
        }

        /// <summary>
        /// A convenience method that tries to first get a TermPositionVector for the specified docId, then falls back to
        /// using the passed in <see cref="Lucene.Net.Documents.Document"/> to retrieve the TokenStream. This is useful when
        /// you already have the document, but would prefer to use the vector first.
        /// </summary>
        /// <param name="reader">The <see cref="IndexReader"/> to use to try and get the vector from</param>
        /// <param name="docId">The docId to retrieve.</param>
        /// <param name="field">The field to retrieve on the document</param>
        /// <param name="doc">The document to fall back on</param>
        /// <param name="analyzer">The analyzer to use for creating the TokenStream if the vector doesn't exist</param>
        /// <returns>The <see cref="TokenStream"/> for the <paramref name="field"/> on the <paramref name="doc"/></returns>
        /// <exception cref="IOException">if there was an error loading</exception>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Document doc,
                                                    Analyzer analyzer)
        {
            TokenStream ts = null;

            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                var termPositionVector = tfv as TermPositionVector;
                if (termPositionVector != null)
                {
                    ts = GetTokenStream(termPositionVector);
                }
            }

            // No token info stored so fall back to analyzing raw content
            return ts ?? GetTokenStream(doc, field, analyzer);
        }
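        // Illustrative sketch (not part of the original class): typical caller-side use of
        // GetAnyTokenStream together with the Highlighter. The names "searcher", "query" and
        // "analyzer" are assumed to exist in the surrounding scope.
        //
        //   Document doc = searcher.Doc(docId);
        //   TokenStream stream = TokenSources.GetAnyTokenStream(searcher.IndexReader, docId, "content", doc, analyzer);
        //   var highlighter = new Highlighter(new QueryScorer(query));
        //   String fragment = highlighter.GetBestFragment(stream, doc.Get("content"));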
        /// <summary>
        /// A convenience method that tries a number of approaches to getting a token stream.
        /// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
        /// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable.
        /// </summary>
        /// <returns>null if field not stored correctly</returns>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            TokenStream ts = null;

            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv != null)
            {
                var termPositionVector = tfv as TermPositionVector;
                if (termPositionVector != null)
                {
                    ts = GetTokenStream(termPositionVector);
                }
            }

            // No token info stored so fall back to analyzing raw content
            return ts ?? GetTokenStream(reader, docId, field, analyzer);
        }

        public static TokenStream GetTokenStream(TermPositionVector tpv)
        {
            // assumes the worst and makes no assumptions about token position sequences.
            return GetTokenStream(tpv, false);
        }
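        // Sketch of assumed caller-side code (not part of the original class): obtaining a
        // TermPositionVector directly and replaying it. This only works for fields indexed
        // with term vectors that include offsets, e.g. Field.TermVector.WITH_POSITIONS_OFFSETS.
        //
        //   var tfv = reader.GetTermFreqVector(docId, "content");
        //   var tpv = tfv as TermPositionVector;
        //   if (tpv != null)
        //   {
        //       TokenStream stream = TokenSources.GetTokenStream(tpv);
        //       // stream now replays the stored tokens without re-analyzing the text
        //   }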
        /// <summary>
        /// Low level api.
        /// Returns a token stream or null if no offset info available in index.
        /// This can be used to feed the highlighter with a pre-parsed token stream.
        ///
        /// In my tests the speeds to recreate 1000 token streams using this method are:
        /// - with TermVector offset only data stored - 420 milliseconds
        /// - with TermVector offset AND position data stored - 271 milliseconds
        ///   (nb timings for TermVector with position data are based on a tokenizer with contiguous
        ///   positions - no overlaps or gaps)
        /// The cost of not using TermPositionVector to store
        /// pre-parsed content and using an analyzer to re-parse the original content:
        /// - reanalyzing the original content - 980 milliseconds
        ///
        /// The re-analyze timings will typically vary depending on -
        /// 1) The complexity of the analyzer code (timings above were using a
        ///    stemmer/lowercaser/stopword combo)
        /// 2) The number of other fields (Lucene reads ALL fields off the disk
        ///    when accessing just one document field - can cost dear!)
        /// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
        ///    or slower (more CPU burn) depending on the content.
        /// </summary>
        /// <param name="tpv">The TermPositionVector to reconstruct tokens from</param>
        /// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
        /// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
        public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
        {
            // code to reconstruct the original sequence of Tokens
            String[] terms = tpv.GetTerms();
            int[] freq = tpv.GetTermFrequencies();

            int totalTokens = freq.Sum();

            var tokensInOriginalOrder = new Token[totalTokens];
            List<Token> unsortedTokens = null;
            for (int t = 0; t < freq.Length; t++)
            {
                TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
                if (offsets == null)
                {
                    return null;
                }

                int[] pos = null;
                if (tokenPositionsGuaranteedContiguous)
                {
                    // try to get the token position info to speed up assembly of tokens into sorted sequence
                    pos = tpv.GetTermPositions(t);
                }
                if (pos == null)
                {
                    // tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                    if (unsortedTokens == null)
                    {
                        unsortedTokens = new List<Token>();
                    }

                    foreach (TermVectorOffsetInfo t1 in offsets)
                    {
                        var token = new Token(t1.StartOffset, t1.EndOffset);
                        token.SetTermBuffer(terms[t]);
                        unsortedTokens.Add(token);
                    }
                }
                else
                {
                    // We have positions stored and a guarantee that the token position information is contiguous.
                    // This may be fast BUT won't work if Tokenizers are used which create more than one token in
                    // the same position, or which create jumps in position numbers - this code would fail under
                    // those circumstances.

                    // tokens stored with positions - can use this to index straight into sorted array
                    for (int tp = 0; tp < pos.Length; tp++)
                    {
                        var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                        tokensInOriginalOrder[pos[tp]] = token;
                    }
                }
            }

            // If the field has been stored without position data we must perform a sort
            if (unsortedTokens != null)
            {
                tokensInOriginalOrder = unsortedTokens.ToArray();
                // Sort by start offset, then end offset. (The previous comparison mixed start and end
                // offsets, which is not a consistent ordering and could produce an incorrect sort.)
                Array.Sort(tokensInOriginalOrder, (t1, t2) =>
                    {
                        int cmp = t1.StartOffset.CompareTo(t2.StartOffset);
                        return cmp != 0 ? cmp : t1.EndOffset.CompareTo(t2.EndOffset);
                    });
            }
            return new StoredTokenStream(tokensInOriginalOrder);
        }

        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field)
        {
            var tfv = reader.GetTermFreqVector(docId, field);
            if (tfv == null)
            {
                throw new ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
            }

            var tpv = tfv as TermPositionVector;
            if (tpv != null)
            {
                return GetTokenStream(tpv);
            }
            throw new ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
        }

        // convenience method
        public static TokenStream GetTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer)
        {
            Document doc = reader.Document(docId);
            return GetTokenStream(doc, field, analyzer);
        }

        public static TokenStream GetTokenStream(Document doc, String field, Analyzer analyzer)
        {
            String contents = doc.Get(field);
            if (contents == null)
            {
                throw new ArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
            }
            return GetTokenStream(field, contents, analyzer);
        }

        // convenience method
        public static TokenStream GetTokenStream(String field, String contents, Analyzer analyzer)
        {
            return analyzer.TokenStream(field, new StringReader(contents));
        }
    }
}
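// Illustrative sketch (an assumption, not part of this file): indexing a field with the term
// vector data the methods above rely on. "writer" is a hypothetical IndexWriter.
//
//   var doc = new Document();
//   doc.Add(new Field("content", "the quick brown fox", Field.Store.YES,
//                     Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
//   writer.AddDocument(doc);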