/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using Payload = Lucene.Net.Index.Payload;

namespace Lucene.Net.Analysis
{
    /// <summary>A Token is an occurrence of a term from the text of a field.  It consists of
    /// a term's text, the start and end offset of the term in the text of the field,
    /// and a type string.
    /// <p>
    /// The start and end offsets permit applications to re-associate a token with
    /// its source text, e.g., to display highlighted query terms in a document
    /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
    /// display, etc.
    /// <p>
    /// The type is an interned string, assigned by a lexical analyzer
    /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
    /// belongs to.  For example an end of sentence marker token might be implemented
    /// with type "eos".  The default token type is "word".
    /// <p>
    /// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
    /// length byte array. Use {@link TermPositions#GetPayloadLength()} and
    /// {@link TermPositions#GetPayload(byte[], int)} to retrieve the payloads from the index.
    /// <p>
    /// WARNING: The status of the Payloads feature is experimental.
    /// The APIs introduced here might change in the future and will not be
    /// supported anymore in such a case.
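    /// <p>
    /// As a rough, illustrative sketch only (the IndexReader "reader" and the field and
    /// term values are assumptions made for this example; only GetPayloadLength() and
    /// GetPayload(byte[], int) are the calls referred to above), payloads can be read
    /// back from an index roughly like this:
    /// <pre>
    /// TermPositions tp = reader.TermPositions(new Term("contents", "example"));
    /// while (tp.Next())
    /// {
    ///     tp.NextPosition();
    ///     int len = tp.GetPayloadLength();
    ///     if (len > 0)
    ///     {
    ///         // copy the payload bytes stored at this position into a fresh buffer
    ///         byte[] data = tp.GetPayload(new byte[len], 0);
    ///     }
    /// }
    /// </pre>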
    /// <p>
    /// NOTE: As of 2.3, Token stores the term text
    /// internally as a malleable char[] termBuffer instead of
    /// String termText.  The indexing code and core tokenizers
    /// have been changed to re-use a single Token instance, changing
    /// its buffer and other fields in-place as the Token is
    /// processed.  This provides substantially better indexing
    /// performance as it saves the GC cost of new'ing a Token and
    /// String for every term.  The APIs that accept String
    /// termText are still available but a warning about the
    /// associated performance cost has been added (below).  The
    /// {@link #TermText()} method has been deprecated.
    /// <p>
    /// Tokenizers and filters should try to re-use a Token
    /// instance when possible for best performance, by
    /// implementing the {@link TokenStream#Next(Token)} API.
    /// Failing that, to create a new Token you should first use
    /// one of the constructors that start with null text.  Then
    /// you should call either {@link #TermBuffer()} or {@link
    /// #ResizeTermBuffer(int)} to retrieve the Token's
    /// termBuffer.  Fill in the characters of your term into this
    /// buffer, and finally call {@link #SetTermLength(int)} to
    /// set the length of the term text.  See LUCENE-969
    /// for details.
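    /// <p>
    /// A minimal sketch of that re-use pattern, assuming a hypothetical filter whose
    /// base class exposes its wrapped stream as "input" (only the Token methods shown
    /// here are defined by this class):
    /// <pre>
    /// public override Token Next(Token result)
    /// {
    ///     result = input.Next(result);
    ///     if (result == null)
    ///         return null;
    ///     // make sure the buffer is large enough, write the term in place,
    ///     // then record how many characters are valid
    ///     char[] buffer = result.ResizeTermBuffer(3);
    ///     buffer[0] = 'f'; buffer[1] = 'o'; buffer[2] = 'o';
    ///     result.SetTermLength(3);
    ///     return result;
    /// }
    /// </pre>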
    /// </summary>
    public class Token : System.ICloneable
    {
        public const System.String DEFAULT_TYPE = "word";
        private static int MIN_BUFFER_SIZE = 10;

        /// <deprecated>: we will remove this when we remove the
        /// deprecated APIs
        /// </deprecated>
        private System.String termText;

        internal char[] termBuffer;                  // characters for the term text
        internal int termLength;                     // length of term text in buffer
        internal int startOffset;                    // start in source text
        internal int endOffset;                      // end in source text
        internal System.String type = DEFAULT_TYPE;  // lexical type

        internal Payload payload;

        internal int positionIncrement = 1;

        /// <summary>Constructs a Token with null text.</summary>
        public Token()
        {
        }

        /// <summary>Constructs a Token with null text and start &amp; end
        /// offsets.
        /// </summary>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        public Token(int start, int end)
        {
            startOffset = start;
            endOffset = end;
        }

        /// <summary>Constructs a Token with null text and start &amp; end
        /// offsets plus the Token type.
        /// </summary>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        public Token(int start, int end, System.String typ)
        {
            startOffset = start;
            endOffset = end;
            type = typ;
        }

        /// <summary>Constructs a Token with the given term text, and start
        /// &amp; end offsets.  The type defaults to "word."
        /// NOTE: for better indexing speed you should
        /// instead use the char[] termBuffer methods to set the
        /// term text.
        /// </summary>
        /// <param name="text">term text</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        public Token(System.String text, int start, int end)
        {
            termText = text;
            startOffset = start;
            endOffset = end;
        }

        /// <summary>Constructs a Token with the given text, start and end
        /// offsets, &amp; type.  NOTE: for better indexing
        /// speed you should instead use the char[] termBuffer
        /// methods to set the term text.
        /// </summary>
        /// <param name="text">term text</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        /// <param name="typ">token type</param>
        public Token(System.String text, int start, int end, System.String typ)
        {
            termText = text;
            startOffset = start;
            endOffset = end;
            type = typ;
        }

        /// <summary>Set the position increment.  This determines the position of this token
        /// relative to the previous Token in a {@link TokenStream}, used in phrase
        /// searching.
        /// <p>
        /// The default value is one.
        /// <p>
        /// Some common uses for this are:<ul>
        /// <li>Set it to zero to put multiple terms in the same position.  This is useful
        /// if, e.g., a word has several stems or synonyms: phrase searches that include
        /// any of them will then match.</li>
        /// <li>Set it to a value greater than one to inhibit exact phrase matches, e.g.
        /// by setting it to the number of stop words removed before a term, so that
        /// phrase queries only match when the terms occur with no intervening stop
        /// words.</li>
        /// </ul>
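        /// <p>
        /// For example, a sketch of stacking a made-up synonym on top of the previous
        /// token (the String constructor is used here only for brevity):
        /// <pre>
        /// Token original = new Token("quick", 0, 5);   // position increment defaults to 1
        /// Token synonym = new Token("fast", 0, 5);
        /// synonym.SetPositionIncrement(0);             // occupies the same position as "quick"
        /// </pre>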
        /// </summary>
        public virtual void SetPositionIncrement(int positionIncrement)
        {
            if (positionIncrement < 0)
                throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
            this.positionIncrement = positionIncrement;
        }

        /// <summary>Returns the position increment of this Token.</summary>
        public virtual int GetPositionIncrement()
        {
            return positionIncrement;
        }

        /// <summary>Sets the Token's term text.  NOTE: for better
        /// indexing speed you should instead use the char[]
        /// termBuffer methods to set the term text.
        /// </summary>
        public virtual void SetTermText(System.String text)
        {
            termText = text;
            termBuffer = null;
        }

        /// <summary>Returns the Token's term text.</summary>
        /// <deprecated> Use {@link #TermBuffer()} and {@link
        /// #TermLength()} instead.
        /// </deprecated>
        public System.String TermText()
        {
            if (termText == null && termBuffer != null)
                termText = new System.String(termBuffer, 0, termLength);
            return termText;
        }

        /// <summary>Copies the contents of buffer, starting at offset for
        /// length characters, into the termBuffer array.
        /// NOTE: for better indexing speed you
        /// should instead retrieve the termBuffer, using {@link
        /// #TermBuffer()} or {@link #ResizeTermBuffer(int)}, and
        /// fill it in directly to set the term text.  This saves
        /// an extra copy.
        /// </summary>
        public void SetTermBuffer(char[] buffer, int offset, int length)
        {
            ResizeTermBuffer(length);
            Array.Copy(buffer, offset, termBuffer, 0, length);
            termLength = length;
        }

        /// <summary>Returns the internal termBuffer character array which
        /// you can then directly alter.  If the array is too
        /// small for your token, use {@link
        /// #ResizeTermBuffer(int)} to increase it.  After
        /// altering the buffer be sure to call {@link
        /// #SetTermLength(int)} to record the number of valid
        /// characters that were placed into the termBuffer.
        /// </summary>
        public char[] TermBuffer()
        {
            InitTermBuffer();
            return termBuffer;
        }

        /// <summary>Grows the termBuffer to at least size newSize.</summary>
        /// <param name="newSize">minimum size of the new termBuffer</param>
        /// <returns>newly created termBuffer with length >= newSize</returns>
        public virtual char[] ResizeTermBuffer(int newSize)
        {
            InitTermBuffer();
            if (newSize > termBuffer.Length)
            {
                int size = termBuffer.Length;
                while (size < newSize)
                    size *= 2;
                char[] newBuffer = new char[size];
                Array.Copy(termBuffer, 0, newBuffer, 0, termBuffer.Length);
                termBuffer = newBuffer;
            }
            return termBuffer;
        }

        // TODO: once we remove the deprecated termText() method
        // and switch entirely to char[] termBuffer we don't need
        // to use this method anymore
        private void InitTermBuffer()
        {
            if (termBuffer == null)
            {
                if (termText == null)
                {
                    termBuffer = new char[MIN_BUFFER_SIZE];
                    termLength = 0;
                }
                else
                {
                    int length = termText.Length;
                    if (length < MIN_BUFFER_SIZE)
                        length = MIN_BUFFER_SIZE;
                    termBuffer = new char[length];
                    termLength = termText.Length;
                    int offset = 0;
                    while (offset < termText.Length)
                    {
                        termBuffer[offset] = (char) termText[offset];
                        offset++;
                    }
                    termText = null;
                }
            }
            else if (termText != null)
            {
                termText = null;
            }
        }

        /// <summary>Return number of valid characters (length of the term)
        /// in the termBuffer array.
        /// </summary>
        public int TermLength()
        {
            InitTermBuffer();
            return termLength;
        }

        /// <summary>Set number of valid characters (length of the term) in
        /// the termBuffer array.
        /// </summary>
        public void SetTermLength(int length)
        {
            InitTermBuffer();
            termLength = length;
        }

        /// <summary>Returns this Token's starting offset, the position of the first character
        /// corresponding to this token in the source text.
        /// Note that the difference between endOffset() and startOffset() may not be
        /// equal to termText.length(), as the term text may have been altered by a
        /// stemmer or some other filter.
        /// </summary>
        public int StartOffset()
        {
            return startOffset;
        }

        /// <summary>Set the starting offset.</summary>
        public virtual void SetStartOffset(int offset)
        {
            this.startOffset = offset;
        }

        /// <summary>Returns this Token's ending offset, one greater than the position of the
        /// last character corresponding to this token in the source text.
        /// </summary>
        public int EndOffset()
        {
            return endOffset;
        }

        /// <summary>Set the ending offset.</summary>
        public virtual void SetEndOffset(int offset)
        {
            this.endOffset = offset;
        }

        /// <summary>Returns this Token's lexical type.  Defaults to "word".</summary>
        public System.String Type()
        {
            return type;
        }

        /// <summary>Set the lexical type.</summary>
        public void SetType(System.String type)
        {
            this.type = type;
        }

        /// <summary>Returns this Token's payload.</summary>
        public virtual Payload GetPayload()
        {
            return this.payload;
        }

        /// <summary>Sets this Token's payload.</summary>
        public virtual void SetPayload(Payload payload)
        {
            this.payload = payload;
        }

        public override System.String ToString()
        {
            System.Text.StringBuilder sb = new System.Text.StringBuilder();
            sb.Append('(');
            InitTermBuffer();
            if (termBuffer == null)
                sb.Append("null");
            else
                sb.Append(termBuffer, 0, termLength);
            sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
            if (!type.Equals("word"))
                sb.Append(",type=").Append(type);
            if (positionIncrement != 1)
                sb.Append(",posIncr=").Append(positionIncrement);
            sb.Append(')');
            return sb.ToString();
        }

        /// <summary>Resets the term text, payload, and positionIncrement to default.
        /// Other fields such as startOffset, endOffset and the token type are
        /// not reset since they are normally overwritten by the tokenizer.
        /// </summary>
        public virtual void Clear()
        {
            payload = null;
            // Leave termBuffer to allow re-use
            termLength = 0;
            termText = null;
            positionIncrement = 1;
            // startOffset = endOffset = 0;
            // type = DEFAULT_TYPE;
        }

        public virtual System.Object Clone()
        {
            try
            {
                Token t = (Token) base.MemberwiseClone();
                if (termBuffer != null)
                {
                    t.termBuffer = null;
                    t.SetTermBuffer(termBuffer, 0, termLength);
                }
                if (payload != null)
                {
                    t.SetPayload((Payload) payload.Clone());
                }
                return t;
            }
            catch (System.Exception e)
            {
                throw new System.SystemException("", e); // shouldn't happen
            }
        }
    }
}