/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Payload = Lucene.Net.Index.Payload;
using TermPositions = Lucene.Net.Index.TermPositions;
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using Attribute = Lucene.Net.Util.Attribute;

namespace Lucene.Net.Analysis
{
    /// <summary>A Token is an occurrence of a term from the text of a field.  It consists of
    /// a term's text, the start and end offset of the term in the text of the field,
    /// and a type string.
    /// <para/>
    /// The start and end offsets permit applications to re-associate a token with
    /// its source text, e.g., to display highlighted query terms in a document
    /// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
    /// display, etc.
    /// <para/>
    /// The type is a string, assigned by a lexical analyzer
    /// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
    /// belongs to.  For example an end-of-sentence marker token might be implemented
    /// with type "eos".  The default token type is "word".
    /// <para/>
    /// A Token can optionally have metadata (a.k.a. payload) in the form of a variable
    /// length byte array. Use <see cref="TermPositions.PayloadLength"/> and
    /// <see cref="TermPositions.GetPayload(byte[], int)"/> to retrieve the payloads from the index.
    /// <para/>
    /// <b>NOTE:</b> As of 2.9, Token implements all <see cref="IAttribute"/> interfaces
    /// that are part of core Lucene and can be found in the Lucene.Net.Analysis.Tokenattributes namespace.
    /// Even though it is not necessary to use Token anymore with the new TokenStream API, it can
    /// be used as a convenience class that implements all attributes, which is especially useful
    /// for easily switching from the old to the new TokenStream API.
    /// <para/>
    /// Tokenizers and TokenFilters should try to re-use a Token instance when
    /// possible for best performance, by implementing the
    /// <see cref="TokenStream.IncrementToken()"/> API.
    /// Failing that, to create a new Token you should first use
    /// one of the constructors that starts with null text.  To load
    /// the token from a char[] use <see cref="SetTermBuffer(char[], int, int)"/>.
    /// To load from a String use <see cref="SetTermBuffer(string)"/> or <see cref="SetTermBuffer(string, int, int)"/>.
    /// Alternatively you can get the Token's termBuffer by calling either <see cref="TermBuffer()"/>,
    /// if you know that your text is shorter than the capacity of the termBuffer,
    /// or <see cref="ResizeTermBuffer(int)"/>, if there is any possibility
    /// that you may need to grow the buffer. Fill in the characters of your term into this
    /// buffer, with <see cref="TextSupport.GetCharsFromString(string, int, int, char[], int)"/> if loading from a string,
    /// or with <see cref="Array.Copy(Array, int, Array, int, int)"/>, and finally call <see cref="SetTermLength(int)"/> to
    /// set the length of the term text.  See <a href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
    /// for details.
    /// <para/>
    /// Typical Token reuse patterns:
    /// <list type="bullet">
    /// <item>Copying text from a string (type is reset to <see cref="DEFAULT_TYPE"/> if not
    /// specified):
    /// <code>
    /// return reusableToken.Reinit(str, startOffset, endOffset[, type]);
    /// </code>
    /// </item>
    /// <item>Copying some text from a string (type is reset to <see cref="DEFAULT_TYPE"/>
    /// if not specified):
    /// <code>
    /// return reusableToken.Reinit(str, 0, str.Length, startOffset, endOffset[, type]);
    /// </code>
    /// </item>
    /// <item>Copying text from a char[] buffer (type is reset to <see cref="DEFAULT_TYPE"/>
    /// if not specified):
    /// <code>
    /// return reusableToken.Reinit(buffer, 0, buffer.Length, startOffset, endOffset[, type]);
    /// </code>
    /// </item>
    /// <item>Copying some text from a char[] buffer (type is reset to <see cref="DEFAULT_TYPE"/>
    /// if not specified):
    /// <code>
    /// return reusableToken.Reinit(buffer, start, end - start, startOffset, endOffset[, type]);
    /// </code>
    /// </item>
    /// <item>Copying from one Token to another (type is reset to <see cref="DEFAULT_TYPE"/>
    /// if not specified):
    /// <code>
    /// return reusableToken.Reinit(source.TermBuffer(), 0, source.TermLength(), source.StartOffset, source.EndOffset[, source.Type]);
    /// </code>
    /// </item>
    /// </list>
    /// <para/>
    /// A few things to note:
    /// <list type="bullet">
    /// <item>Clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</item>
    /// <item>Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.</item>
    /// <item>The startOffset and endOffset represent the start and end offset of the token in the
    /// source text, so be careful in adjusting them.</item>
    /// <item>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</item>
    /// </list>
    /// </summary>
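    /// <example>
    /// A minimal, illustrative sketch (the filter class itself is hypothetical, not
    /// part of Lucene): because Token implements the attribute interfaces, a
    /// TokenFilter written against the new API can mutate the shared term buffer in
    /// place rather than allocating a new string per token:
    /// <code>
    /// public sealed class LowerCaseSketchFilter : TokenFilter
    /// {
    ///     private readonly ITermAttribute termAtt;
    ///
    ///     public LowerCaseSketchFilter(TokenStream input) : base(input)
    ///     {
    ///         termAtt = AddAttribute&lt;ITermAttribute&gt;();
    ///     }
    ///
    ///     public override bool IncrementToken()
    ///     {
    ///         if (!input.IncrementToken())
    ///             return false;
    ///         char[] buffer = termAtt.TermBuffer(); // shared buffer, edited in place
    ///         for (int i = 0; i &lt; termAtt.TermLength(); i++)
    ///             buffer[i] = char.ToLowerInvariant(buffer[i]);
    ///         return true;
    ///     }
    /// }
    /// </code>
    /// </example>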
    /// <seealso cref="Lucene.Net.Index.Payload"/>
    [Serializable]
    public class Token : Attribute, ITermAttribute, ITypeAttribute, IPositionIncrementAttribute, IFlagsAttribute, IOffsetAttribute, IPayloadAttribute
    {
        public const String DEFAULT_TYPE = "word";

        private const int MIN_BUFFER_SIZE = 10;

        private char[] termBuffer;
        private int termLength;
        private int startOffset, endOffset;
        private string type = DEFAULT_TYPE;
        private int flags;
        private Payload payload;
        private int positionIncrement = 1;

        /// <summary>Constructs a Token with null text.</summary>
        public Token()
        {
        }

        /// <summary>Constructs a Token with null text and start &amp; end
        /// offsets.
        /// </summary>
        /// <param name="start">start offset in the source text</param>
        /// <param name="end">end offset in the source text</param>
        public Token(int start, int end)
        {
            startOffset = start;
            endOffset = end;
        }

        /// <summary>Constructs a Token with null text and start &amp; end
        /// offsets plus the Token type.
        /// </summary>
        /// <param name="start">start offset in the source text</param>
        /// <param name="end">end offset in the source text</param>
        /// <param name="typ">the lexical type of this Token</param>
        public Token(int start, int end, String typ)
        {
            startOffset = start;
            endOffset = end;
            type = typ;
        }

        /// <summary>Constructs a Token with null text and start &amp; end
        /// offsets plus flags. NOTE: flags is EXPERIMENTAL.
        /// </summary>
        /// <param name="start">start offset in the source text</param>
        /// <param name="end">end offset in the source text</param>
        /// <param name="flags">the bits to set for this token</param>
        public Token(int start, int end, int flags)
        {
            startOffset = start;
            endOffset = end;
            this.flags = flags;
        }

        /// <summary>Constructs a Token with the given term text, and start
        /// &amp; end offsets.  The type defaults to "word."
        /// <b>NOTE:</b> for better indexing speed you should
        /// instead use the char[] termBuffer methods to set the
        /// term text.
        /// </summary>
        /// <param name="text">term text</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        public Token(String text, int start, int end)
        {
            SetTermBuffer(text);
            startOffset = start;
            endOffset = end;
        }

        /// <summary>Constructs a Token with the given text, start and end
        /// offsets, &amp; type.  <b>NOTE:</b> for better indexing
        /// speed you should instead use the char[] termBuffer
        /// methods to set the term text.
        /// </summary>
        /// <param name="text">term text</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        /// <param name="typ">token type</param>
        public Token(System.String text, int start, int end, System.String typ)
        {
            SetTermBuffer(text);
            startOffset = start;
            endOffset = end;
            type = typ;
        }

        /// <summary>Constructs a Token with the given text, start and end
        /// offsets, &amp; flags.  <b>NOTE:</b> for better indexing
        /// speed you should instead use the char[] termBuffer
        /// methods to set the term text.
        /// </summary>
        /// <param name="text">term text</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        /// <param name="flags">token type bits</param>
        public Token(System.String text, int start, int end, int flags)
        {
            SetTermBuffer(text);
            startOffset = start;
            endOffset = end;
            this.flags = flags;
        }

        /// <summary>Constructs a Token with the given term buffer (offset
        /// &amp; length), start and end offsets.
        /// </summary>
        /// <param name="startTermBuffer">buffer containing the term text</param>
        /// <param name="termBufferOffset">the index in the buffer of the first character</param>
        /// <param name="termBufferLength">the number of valid characters in the buffer</param>
        /// <param name="start">start offset</param>
        /// <param name="end">end offset</param>
        public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end)
        {
            SetTermBuffer(startTermBuffer, termBufferOffset, termBufferLength);
            startOffset = start;
            endOffset = end;
        }

        /// <summary>Set the position increment.  This determines the position of this token
        /// relative to the previous Token in a <see cref="TokenStream"/>, used in phrase
        /// searching.
        /// <para/>The default value is one.
        /// <para/>Some common uses for this are:
        /// <list type="bullet">
        /// <item>Set it to zero to put multiple terms in the same position.  This is
        /// useful if, e.g., a word has multiple stems.  Searches for phrases
        /// including either stem will match.  In this case, all but the first stem's
        /// increment should be set to zero: the increment of the first instance
        /// should be one.  Repeating a token with an increment of zero can also be
        /// used to boost the scores of matches on that token.</item>
        /// <item>Set it to values greater than one to inhibit exact phrase matches.
        /// If, for example, one does not want phrases to match across removed stop
        /// words, then one could build a stop word filter that removes stop words and
        /// also sets the increment to the number of stop words removed before each
        /// non-stop word.  Then exact phrase queries will only match when the terms
        /// occur with no intervening stop words.</item>
        /// </list>
        /// </summary>
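        /// <example>
        /// An illustrative sketch (the token values are hypothetical): stacking a
        /// synonym on the preceding token by giving it a position increment of zero,
        /// so phrase queries match either term:
        /// <code>
        /// Token original = new Token("quick", 0, 5); // increment defaults to 1
        /// Token synonym = new Token("fast", 0, 5);
        /// synonym.PositionIncrement = 0;             // occupies the same position as "quick"
        /// </code>
        /// </example>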
        /// <value>the distance from the prior term</value>
        public virtual int PositionIncrement
        {
            set
            {
                if (value < 0)
                    throw new System.ArgumentException("Increment must be zero or greater: " + value);
                this.positionIncrement = value;
            }
            get { return positionIncrement; }
        }

        /// <summary>Returns the Token's term text.
        ///
        /// This method has a performance penalty
        /// because the text is stored internally in a char[].  If
        /// possible, use <see cref="TermBuffer()"/> and <see cref="TermLength()"/>
        /// directly instead.  If you really need a
        /// String, use this method, which is nothing more than
        /// a convenience call to <b>new String(token.TermBuffer(), 0, token.TermLength())</b>.
        /// </summary>
        public string Term
        {
            get
            {
                InitTermBuffer();
                return new System.String(termBuffer, 0, termLength);
            }
        }

        /// <summary>Copies the contents of buffer, starting at offset for
        /// length characters, into the termBuffer array.
        /// </summary>
        /// <param name="buffer">the buffer to copy</param>
        /// <param name="offset">the index in the buffer of the first character to copy</param>
        /// <param name="length">the number of characters to copy</param>
        public void SetTermBuffer(char[] buffer, int offset, int length)
        {
            GrowTermBuffer(length);
            Array.Copy(buffer, offset, termBuffer, 0, length);
            termLength = length;
        }

        /// <summary>Copies the contents of buffer into the termBuffer array.</summary>
        /// <param name="buffer">the buffer to copy</param>
        public void SetTermBuffer(System.String buffer)
        {
            int length = buffer.Length;
            GrowTermBuffer(length);
            TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0);
            termLength = length;
        }

        /// <summary>Copies the contents of buffer, starting at offset and continuing
        /// for length characters, into the termBuffer array.
        /// </summary>
        /// <param name="buffer">the buffer to copy</param>
        /// <param name="offset">the index in the buffer of the first character to copy</param>
        /// <param name="length">the number of characters to copy</param>
        public void SetTermBuffer(System.String buffer, int offset, int length)
        {
            System.Diagnostics.Debug.Assert(offset <= buffer.Length);
            System.Diagnostics.Debug.Assert(offset + length <= buffer.Length);
            GrowTermBuffer(length);
            TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0);
            termLength = length;
        }

        /// <summary>Returns the internal termBuffer character array which
        /// you can then directly alter.  If the array is too
        /// small for your token, use <see cref="ResizeTermBuffer(int)"/>
        /// to increase it.  After altering the buffer be sure to call
        /// <see cref="SetTermLength(int)"/> to record the number of valid
        /// characters that were placed into the termBuffer.
        /// </summary>
        public char[] TermBuffer()
        {
            InitTermBuffer();
            return termBuffer;
        }

        /// <summary>Grows the termBuffer to at least size newSize, preserving the
        /// existing content. Note: If the next operation is to change
        /// the contents of the term buffer use
        /// <see cref="SetTermBuffer(char[], int, int)"/>,
        /// <see cref="SetTermBuffer(string)"/>, or
        /// <see cref="SetTermBuffer(string, int, int)"/>
        /// to optimally combine the resize with the setting of the termBuffer.
        /// </summary>
        /// <param name="newSize">minimum size of the new termBuffer</param>
        /// <returns>newly created termBuffer with length &gt;= newSize</returns>
        public virtual char[] ResizeTermBuffer(int newSize)
        {
            if (termBuffer == null)
            {
                termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
            }
            else if (termBuffer.Length < newSize)
            {
                // Not big enough; create a new array with slight
                // over allocation and preserve content
                var newCharBuffer = new char[ArrayUtil.GetNextSize(newSize)];
                Array.Copy(termBuffer, 0, newCharBuffer, 0, termBuffer.Length);
                termBuffer = newCharBuffer;
            }
            return termBuffer;
        }
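        // Illustrative sketch (sourceChars, wordStart and wordLength are hypothetical):
        // the buffer-filling pattern described in the class comment, avoiding a string
        // allocation per token.
        //
        //   Token t = new Token();
        //   char[] buf = t.ResizeTermBuffer(wordLength);            // may allocate more than wordLength
        //   Array.Copy(sourceChars, wordStart, buf, 0, wordLength);
        //   t.SetTermLength(wordLength);                            // record the valid length
        //   t.SetOffset(wordStart, wordStart + wordLength);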
        /// <summary>Allocates a buffer char[] of at least newSize, without preserving the
        /// existing content. It is always used in places that subsequently set the content.
        /// </summary>
        /// <param name="newSize">minimum size of the buffer</param>
        private void GrowTermBuffer(int newSize)
        {
            if (termBuffer == null)
            {
                // The buffer is always at least MIN_BUFFER_SIZE
                termBuffer = new char[ArrayUtil.GetNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)];
            }
            else if (termBuffer.Length < newSize)
            {
                // Not big enough; create a new array with slight
                // over allocation:
                termBuffer = new char[ArrayUtil.GetNextSize(newSize)];
            }
        }

        private void InitTermBuffer()
        {
            if (termBuffer == null)
            {
                termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)];
                termLength = 0;
            }
        }

        /// <summary>Return number of valid characters (length of the term)
        /// in the termBuffer array.
        /// </summary>
        public int TermLength()
        {
            InitTermBuffer();
            return termLength;
        }

        /// <summary>Set number of valid characters (length of the term) in
        /// the termBuffer array. Use this to truncate the termBuffer
        /// or to synchronize with external manipulation of the termBuffer.
        /// Note: to grow the size of the array,
        /// use <see cref="ResizeTermBuffer(int)"/> first.
        /// </summary>
        /// <param name="length">the truncated length</param>
        public void SetTermLength(int length)
        {
            InitTermBuffer();
            if (length > termBuffer.Length)
                throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")");
            termLength = length;
        }

        /// <summary>Gets or sets this Token's starting offset, the position of the first character
        /// corresponding to this token in the source text.
        /// Note that the difference between EndOffset and StartOffset may not be
        /// equal to <see cref="TermLength()"/>, as the term text may have been altered by a
        /// stemmer or some other filter.
        /// </summary>
        public virtual int StartOffset
        {
            get { return startOffset; }
            set { this.startOffset = value; }
        }

        /// <summary>Gets or sets this Token's ending offset, one greater than the position of the
        /// last character corresponding to this token in the source text. The length
        /// of the token in the source text is (EndOffset - StartOffset).
        /// </summary>
        public virtual int EndOffset
        {
            get { return endOffset; }
            set { this.endOffset = value; }
        }

        /// <summary>Set the starting and ending offset.
        /// See <see cref="StartOffset"/> and <see cref="EndOffset"/>.
        /// </summary>
        public virtual void SetOffset(int startOffset, int endOffset)
        {
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        /// <summary>Returns this Token's lexical type.  Defaults to "word".</summary>
        public string Type
        {
            get { return type; }
            set { this.type = value; }
        }

        /// <summary>EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
        /// <para/>
        /// Get the bitset for any bits that have been set. This is completely distinct
        /// from <see cref="Type"/>, although they do share similar purposes.
        /// The flags can be used to encode information about the token for use by other
        /// <see cref="TokenFilter"/>s.
        /// </summary>
        /// <value>The bits</value>
        public virtual int Flags
        {
            get { return flags; }
            set { flags = value; }
        }

        /// <summary>Returns this Token's payload.</summary>
        public virtual Payload Payload
        {
            get { return payload; }
            set { payload = value; }
        }

        public override String ToString()
        {
            var sb = new System.Text.StringBuilder();
            sb.Append('(');
            InitTermBuffer();
            if (termBuffer == null)
                sb.Append("null");
            else
                sb.Append(termBuffer, 0, termLength);
            sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
            if (!type.Equals("word"))
                sb.Append(",type=").Append(type);
            if (positionIncrement != 1)
                sb.Append(",posIncr=").Append(positionIncrement);
            sb.Append(')');
            return sb.ToString();
        }

        /// <summary>Resets the term text, payload, flags, positionIncrement,
        /// startOffset, endOffset and token type to default.
        /// </summary>
        public override void Clear()
        {
            payload = null;
            // Leave termBuffer to allow re-use
            termLength = 0;
            positionIncrement = 1;
            flags = 0;
            startOffset = endOffset = 0;
            type = DEFAULT_TYPE;
        }

        public override System.Object Clone()
        {
            var t = (Token) base.Clone();
            // Do a deep clone
            if (termBuffer != null)
            {
                t.termBuffer = new char[termBuffer.Length];
                termBuffer.CopyTo(t.termBuffer, 0);
            }
            if (payload != null)
            {
                t.payload = (Payload) payload.Clone();
            }
            return t;
        }

        /// <summary>Makes a clone, but replaces the term buffer &amp;
        /// start/end offset in the process.  This is more
        /// efficient than doing a full clone (and then calling
        /// SetTermBuffer) because it saves a wasted copy of the old
        /// termBuffer.
        /// </summary>
        public virtual Token Clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
        {
            var t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset)
                        {positionIncrement = positionIncrement, flags = flags, type = type};
            if (payload != null)
                t.payload = (Payload) payload.Clone();
            return t;
        }

        public override bool Equals(Object obj)
        {
            if (obj == this)
                return true;

            var other = obj as Token;
            if (other == null)
                return false;

            InitTermBuffer();
            other.InitTermBuffer();

            if (termLength == other.termLength && startOffset == other.startOffset && endOffset == other.endOffset &&
                flags == other.flags && positionIncrement == other.positionIncrement && SubEqual(type, other.type) &&
                SubEqual(payload, other.payload))
            {
                for (int i = 0; i < termLength; i++)
                    if (termBuffer[i] != other.termBuffer[i])
                        return false;
                return true;
            }
            return false;
        }

        private bool SubEqual(System.Object o1, System.Object o2)
        {
            if (o1 == null)
                return o2 == null;
            return o1.Equals(o2);
        }

        public override int GetHashCode()
        {
            InitTermBuffer();
            int code = termLength;
            code = code * 31 + startOffset;
            code = code * 31 + endOffset;
            code = code * 31 + flags;
            code = code * 31 + positionIncrement;
            code = code * 31 + type.GetHashCode();
            code = (payload == null ? code : code * 31 + payload.GetHashCode());
            code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength);
            return code;
        }

        // like Clear() but doesn't clear termBuffer/text
        private void ClearNoTermBuffer()
        {
            payload = null;
            positionIncrement = 1;
            flags = 0;
            startOffset = endOffset = 0;
            type = DEFAULT_TYPE;
        }
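        // Illustrative sketch (the byte value is hypothetical): attaching per-token
        // metadata via a Payload, which is stored in the index alongside the token's
        // positions and can be read back with TermPositions.
        //
        //   Token t = new Token("doe", 4, 7);
        //   t.Payload = new Payload(new byte[] { 42 });   // e.g. an application-defined weight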
        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(char[], int, int)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/>
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
        {
            ClearNoTermBuffer();
            payload = null;
            positionIncrement = 1;
            SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = newType;
            return this;
        }

        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(char[], int, int)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/> on Token.DEFAULT_TYPE
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
        {
            ClearNoTermBuffer();
            SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = DEFAULT_TYPE;
            return this;
        }

        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(string)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/>
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset, System.String newType)
        {
            ClearNoTermBuffer();
            SetTermBuffer(newTerm);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = newType;
            return this;
        }

        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(string, int, int)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/>
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
        {
            ClearNoTermBuffer();
            SetTermBuffer(newTerm, newTermOffset, newTermLength);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = newType;
            return this;
        }

        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(string)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/> on Token.DEFAULT_TYPE
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset)
        {
            ClearNoTermBuffer();
            SetTermBuffer(newTerm);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = DEFAULT_TYPE;
            return this;
        }

        /// <summary>Shorthand for calling <see cref="Clear"/>,
        /// <see cref="SetTermBuffer(string, int, int)"/>,
        /// <see cref="StartOffset"/>,
        /// <see cref="EndOffset"/>,
        /// <see cref="Type"/> on Token.DEFAULT_TYPE
        /// </summary>
        /// <returns>this Token instance</returns>
        public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
        {
            ClearNoTermBuffer();
            SetTermBuffer(newTerm, newTermOffset, newTermLength);
            startOffset = newStartOffset;
            endOffset = newEndOffset;
            type = DEFAULT_TYPE;
            return this;
        }

        /// <summary>Copy the prototype token's fields into this one. Note: Payloads are shared.</summary>
        /// <param name="prototype">the Token to copy from</param>
        public virtual void Reinit(Token prototype)
        {
            prototype.InitTermBuffer();
            SetTermBuffer(prototype.termBuffer, 0, prototype.termLength);
            positionIncrement = prototype.positionIncrement;
            flags = prototype.flags;
            startOffset = prototype.startOffset;
            endOffset = prototype.endOffset;
            type = prototype.type;
            payload = prototype.payload;
        }

        /// <summary>Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.</summary>
        /// <param name="prototype">the Token to copy the non-term fields from</param>
        /// <param name="newTerm">the new term text</param>
        public virtual void Reinit(Token prototype, System.String newTerm)
        {
            SetTermBuffer(newTerm);
            positionIncrement = prototype.positionIncrement;
            flags = prototype.flags;
            startOffset = prototype.startOffset;
            endOffset = prototype.endOffset;
            type = prototype.type;
            payload = prototype.payload;
        }
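        // Illustrative sketch (reusableToken, wordChars, wordStart and wordEnd are
        // hypothetical): the single-allocation reuse pattern from the class comment,
        // re-populating one cached Token per input word via Reinit.
        //
        //   private readonly Token reusableToken = new Token();
        //   ...
        //   return reusableToken.Reinit(wordChars, 0, wordChars.Length, wordStart, wordEnd);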
        /// <summary>Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.</summary>
        /// <param name="prototype">the Token to copy the non-term fields from</param>
        /// <param name="newTermBuffer">the buffer containing the new term text</param>
        /// <param name="offset">the index in the buffer of the first character to copy</param>
        /// <param name="length">the number of characters to copy</param>
        public virtual void Reinit(Token prototype, char[] newTermBuffer, int offset, int length)
        {
            SetTermBuffer(newTermBuffer, offset, length);
            positionIncrement = prototype.positionIncrement;
            flags = prototype.flags;
            startOffset = prototype.startOffset;
            endOffset = prototype.endOffset;
            type = prototype.type;
            payload = prototype.payload;
        }

        public override void CopyTo(Attribute target)
        {
            if (target is Token)
            {
                var to = (Token) target;
                to.Reinit(this);
                // Reinit shares the payload, so clone it:
                if (payload != null)
                {
                    to.payload = (Payload) payload.Clone();
                }
            }
            else
            {
                InitTermBuffer();
                ((ITermAttribute) target).SetTermBuffer(termBuffer, 0, termLength);
                ((IOffsetAttribute) target).SetOffset(startOffset, endOffset);
                ((IPositionIncrementAttribute) target).PositionIncrement = positionIncrement;
                ((IPayloadAttribute) target).Payload = (payload == null) ? null : (Payload) payload.Clone();
                ((IFlagsAttribute) target).Flags = flags;
                ((ITypeAttribute) target).Type = type;
            }
        }

        /// <summary>
        /// Convenience factory that returns <see cref="Token"/> as implementation for the basic
        /// attributes and returns the default impl (with "Impl" appended) for all other
        /// attributes.
        /// @since 3.0
        /// </summary>
        public static AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
            new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);

        /// <summary>
        /// <b>Expert:</b> Creates an AttributeFactory returning <see cref="Token"/> as instance for the basic attributes
        /// and for all other attributes calls the given delegate factory.
        /// </summary>
        public class TokenAttributeFactory : AttributeSource.AttributeFactory
        {
            private readonly AttributeSource.AttributeFactory _delegateFactory;

            /// <summary>
            /// <b>Expert:</b> Creates an AttributeFactory returning <see cref="Token"/> as instance for the basic attributes
            /// and for all other attributes calls the given delegate factory.
            /// </summary>
            public TokenAttributeFactory(AttributeSource.AttributeFactory delegateFactory)
            {
                this._delegateFactory = delegateFactory;
            }

            public override Attribute CreateAttributeInstance<T>()
            {
                return typeof(T).IsAssignableFrom(typeof(Token))
                           ? new Token()
                           : _delegateFactory.CreateAttributeInstance<T>();
            }

            public override bool Equals(Object other)
            {
                if (this == other)
                    return true;

                var af = other as TokenAttributeFactory;
                return af != null && _delegateFactory.Equals(af._delegateFactory);
            }

            public override int GetHashCode()
            {
                return _delegateFactory.GetHashCode() ^ 0x0a45aa31;
            }
        }
    }
}