/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Payload = Lucene.Net.Index.Payload;
using TermPositions = Lucene.Net.Index.TermPositions;
using ArrayUtil = Lucene.Net.Util.ArrayUtil;
using Attribute = Lucene.Net.Util.Attribute;
namespace Lucene.Net.Analysis
{
/// A Token is an occurrence of a term from the text of a field. It consists of
/// a term's text, the start and end offset of the term in the text of the field,
/// and a type string.
///
/// The start and end offsets permit applications to re-associate a token with
/// its source text, e.g., to display highlighted query terms in a document
/// browser, or to show matching text fragments in a KWIC display, etc.
///
/// The type is a string, assigned by a lexical analyzer
/// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
/// belongs to. For example an end of sentence marker token might be implemented
/// with type "eos". The default token type is "word".
///
/// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
/// length byte array. Use and
/// to retrieve the payloads from the index.
///
///
///
/// NOTE: As of 2.9, Token implements all interfaces
/// that are part of core Lucene and can be found in the namespace.
/// Even though it is not necessary to use Token anymore, with the new TokenStream API it can
/// be used as convenience class that implements all s, which is especially useful
/// to easily switch from the old to the new TokenStream API.
///
/// Tokenizers and TokenFilters should try to re-use a Token instance when
/// possible for best performance, by implementing the
/// API.
/// Failing that, to create a new Token you should first use
/// one of the constructors that starts with null text. To load
/// the token from a char[] use .
/// To load from a String use or .
/// Alternatively you can get the Token's termBuffer by calling either ,
/// if you know that your text is shorter than the capacity of the termBuffer
/// or , if there is any possibility
/// that you may need to grow the buffer. Fill in the characters of your term into this
/// buffer, with if loading from a string,
/// or with , and finally call to
/// set the length of the term text. See LUCENE-969
/// for details.
/// Typical Token reuse patterns:
///
/// - Copying text from a string (type is reset to if not
/// specified):
///
/// return reusableToken.reinit(string, startOffset, endOffset[, type]);
///
///
/// - Copying some text from a string (type is reset to
/// if not specified):
///
/// return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
///
///
/// - Copying text from char[] buffer (type is reset to
/// if not specified):
///
/// return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
///
///
/// - Copying some text from a char[] buffer (type is reset to
/// if not specified):
///
/// return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
///
///
/// - Copying from one Token to another (type is reset to
/// if not specified):
///
/// return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
///
///
///
/// A few things to note:
///
/// - clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.
/// - Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
/// - The startOffset and endOffset represent the start and end offset in the
/// source text, so be careful in adjusting them.
/// - When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
///
///
///
///
///
[Serializable]
public class Token : Attribute, ITermAttribute, ITypeAttribute, IPositionIncrementAttribute, IFlagsAttribute, IOffsetAttribute, IPayloadAttribute
{
    /// <summary>The default lexical type of a token: "word".</summary>
    public const System.String DEFAULT_TYPE = "word";
    // Smallest capacity ever allocated for termBuffer.
    private static int MIN_BUFFER_SIZE = 10;
    // Characters of the term text; lazily allocated, may be null until first use.
    private char[] termBuffer;
    // Number of valid characters in termBuffer.
    private int termLength;
    // Character offsets of this token within the source text.
    private int startOffset, endOffset;
    // Lexical type; defaults to DEFAULT_TYPE ("word").
    private string type = DEFAULT_TYPE;
    // EXPERIMENTAL bit set carrying extra token metadata between filters.
    private int flags;
    // Optional variable-length metadata stored at the term position; may be null.
    private Payload payload;
    // Position of this token relative to the previous one; defaults to 1.
    private int positionIncrement = 1;
/// <summary>Constructs a Token with null text.</summary>
public Token()
{
}
/// <summary>Constructs a Token with null text and start &amp; end offsets.</summary>
/// <param name="start">start offset in the source text</param>
/// <param name="end">end offset in the source text</param>
public Token(int start, int end)
{
    startOffset = start;
    endOffset = end;
}
/// <summary>Constructs a Token with null text, start &amp; end offsets, plus the Token type.</summary>
/// <param name="start">start offset in the source text</param>
/// <param name="end">end offset in the source text</param>
/// <param name="typ">the lexical type of this Token</param>
public Token(int start, int end, System.String typ)
{
    startOffset = start;
    endOffset = end;
    type = typ;
}
/// <summary>Constructs a Token with null text, start &amp; end offsets, plus flags. NOTE: flags is EXPERIMENTAL.</summary>
/// <param name="start">start offset in the source text</param>
/// <param name="end">end offset in the source text</param>
/// <param name="flags">the bits to set for this token</param>
public Token(int start, int end, int flags)
{
    startOffset = start;
    endOffset = end;
    this.flags = flags;
}
/// <summary>
/// Constructs a Token with the given term text and start &amp; end offsets.
/// The type defaults to "word". NOTE: for better indexing speed you should
/// instead use the char[] termBuffer methods to set the term text.
/// </summary>
/// <param name="text">term text</param>
/// <param name="start">start offset</param>
/// <param name="end">end offset</param>
public Token(System.String text, int start, int end)
{
    SetTermBuffer(text);
    startOffset = start;
    endOffset = end;
}
/// <summary>
/// Constructs a Token with the given text, start and end offsets, &amp; type.
/// NOTE: for better indexing speed you should instead use the char[]
/// termBuffer methods to set the term text.
/// </summary>
/// <param name="text">term text</param>
/// <param name="start">start offset</param>
/// <param name="end">end offset</param>
/// <param name="typ">token type</param>
public Token(System.String text, int start, int end, System.String typ)
{
    SetTermBuffer(text);
    startOffset = start;
    endOffset = end;
    type = typ;
}
/// <summary>
/// Constructs a Token with the given text, start and end offsets, &amp; flags.
/// NOTE: for better indexing speed you should instead use the char[]
/// termBuffer methods to set the term text.
/// </summary>
/// <param name="text">term text</param>
/// <param name="start">start offset</param>
/// <param name="end">end offset</param>
/// <param name="flags">token type bits</param>
public Token(System.String text, int start, int end, int flags)
{
    SetTermBuffer(text);
    startOffset = start;
    endOffset = end;
    this.flags = flags;
}
/// <summary>
/// Constructs a Token with the given term buffer slice (offset &amp; length)
/// and start/end offsets.
/// </summary>
/// <param name="startTermBuffer">buffer containing the term characters</param>
/// <param name="termBufferOffset">index of the first character to copy</param>
/// <param name="termBufferLength">number of characters to copy</param>
/// <param name="start">start offset in the source text</param>
/// <param name="end">end offset in the source text</param>
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end)
{
    SetTermBuffer(startTermBuffer, termBufferOffset, termBufferLength);
    startOffset = start;
    endOffset = end;
}
/// <summary>
/// Gets or sets the position increment: the position of this token relative
/// to the previous Token in the stream, used in phrase searching.
/// The default value is one.
/// <para>
/// Common uses: set it to zero to put multiple terms at the same position
/// (e.g. several stems of one word, so a phrase matching either stem works;
/// all but the first stem get increment zero). Set it to a value greater than
/// one to leave gaps, e.g. the number of removed stop words before a term, so
/// exact phrase queries only match when terms occur with no intervening stop
/// words.
/// </para>
/// </summary>
/// <value>the distance from the prior term</value>
/// <exception cref="System.ArgumentException">if the value is negative</exception>
public virtual int PositionIncrement
{
    set
    {
        // A negative increment would place this token before its predecessor.
        if (value < 0)
            throw new System.ArgumentException("Increment must be zero or greater: " + value);
        this.positionIncrement = value;
    }
    get { return positionIncrement; }
}
/// <summary>
/// Returns the Token's term text as a new string.
/// <para>
/// This method has a performance penalty because the text is stored
/// internally in a char[]; prefer <c>TermBuffer()</c> and <c>TermLength()</c>
/// where possible. This is nothing more than a convenience call equivalent to
/// <c>new String(TermBuffer(), 0, TermLength())</c>.
/// </para>
/// </summary>
public System.String Term()
{
    InitTermBuffer();
    return new System.String(termBuffer, 0, termLength);
}
/// <summary>
/// Copies the contents of buffer, starting at offset for length characters,
/// into the termBuffer array, growing it if needed (existing content is not
/// preserved because it is fully overwritten).
/// </summary>
/// <param name="buffer">the buffer to copy</param>
/// <param name="offset">the index in the buffer of the first character to copy</param>
/// <param name="length">the number of characters to copy</param>
public void SetTermBuffer(char[] buffer, int offset, int length)
{
    GrowTermBuffer(length);
    Array.Copy(buffer, offset, termBuffer, 0, length);
    termLength = length;
}
/// <summary>
/// Copies the contents of the string into the termBuffer array, growing it
/// if needed.
/// </summary>
/// <param name="buffer">the string to copy</param>
public void SetTermBuffer(System.String buffer)
{
    int length = buffer.Length;
    GrowTermBuffer(length);
    TextSupport.GetCharsFromString(buffer, 0, length, termBuffer, 0);
    termLength = length;
}
/// <summary>
/// Copies the contents of the string, starting at offset and continuing for
/// length characters, into the termBuffer array.
/// </summary>
/// <param name="buffer">the string to copy</param>
/// <param name="offset">the index in the string of the first character to copy</param>
/// <param name="length">the number of characters to copy</param>
public void SetTermBuffer(System.String buffer, int offset, int length)
{
    // Debug-only range checks; release builds rely on the copy throwing.
    System.Diagnostics.Debug.Assert(offset <= buffer.Length);
    System.Diagnostics.Debug.Assert(offset + length <= buffer.Length);
    GrowTermBuffer(length);
    TextSupport.GetCharsFromString(buffer, offset, offset + length, termBuffer, 0);
    termLength = length;
}
/// <summary>
/// Returns the internal termBuffer character array, which callers may alter
/// directly. If the array is too small, use <c>ResizeTermBuffer(int)</c> to
/// grow it. After altering the buffer be sure to call <c>SetTermLength(int)</c>
/// to record the number of valid characters placed into the termBuffer.
/// </summary>
public char[] TermBuffer()
{
    InitTermBuffer();
    return termBuffer;
}
/// <summary>
/// Grows the termBuffer to at least <paramref name="newSize"/> characters,
/// preserving existing content. Note: if the next operation is to replace the
/// buffer's contents entirely, prefer one of the <c>SetTermBuffer</c>
/// overloads, which combine the resize with the write.
/// </summary>
/// <param name="newSize">minimum size of the new termBuffer</param>
/// <returns>the termBuffer, with length &gt;= newSize</returns>
public virtual char[] ResizeTermBuffer(int newSize)
{
    if (termBuffer == null)
    {
        // First allocation: never smaller than MIN_BUFFER_SIZE.
        int initialSize = newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize;
        termBuffer = new char[ArrayUtil.GetNextSize(initialSize)];
    }
    else if (termBuffer.Length < newSize)
    {
        // Not big enough: over-allocate slightly and carry the content over.
        char[] grown = new char[ArrayUtil.GetNextSize(newSize)];
        Array.Copy(termBuffer, 0, grown, 0, termBuffer.Length);
        termBuffer = grown;
    }
    return termBuffer;
}
/// <summary>
/// Ensures termBuffer can hold at least <paramref name="newSize"/> characters
/// WITHOUT preserving the existing content; it is only called from places
/// that immediately overwrite the buffer.
/// </summary>
/// <param name="newSize">minimum size of the buffer</param>
private void GrowTermBuffer(int newSize)
{
    if (termBuffer == null)
    {
        // First allocation: the buffer is always at least MIN_BUFFER_SIZE.
        int initialSize = newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize;
        termBuffer = new char[ArrayUtil.GetNextSize(initialSize)];
    }
    else if (termBuffer.Length < newSize)
    {
        // Not big enough: discard and over-allocate slightly for future growth.
        termBuffer = new char[ArrayUtil.GetNextSize(newSize)];
    }
}
// Lazily allocates an empty termBuffer of the minimum capacity; after this
// call termBuffer is guaranteed non-null.
private void InitTermBuffer()
{
    if (termBuffer == null)
    {
        termBuffer = new char[ArrayUtil.GetNextSize(MIN_BUFFER_SIZE)];
        termLength = 0;
    }
}
/// <summary>
/// Returns the number of valid characters (the length of the term) in the
/// termBuffer array.
/// </summary>
public int TermLength()
{
    InitTermBuffer();
    return termLength;
}
/// <summary>
/// Sets the number of valid characters (the length of the term) in the
/// termBuffer array. Use this to truncate the termBuffer or to synchronize
/// with external manipulation of it. Note: to grow the size of the array,
/// use <c>ResizeTermBuffer(int)</c> first.
/// </summary>
/// <param name="length">the truncated length</param>
/// <exception cref="System.ArgumentException">if length exceeds the buffer capacity</exception>
public void SetTermLength(int length)
{
    InitTermBuffer();
    if (length > termBuffer.Length)
        throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")");
    termLength = length;
}
/// <summary>
/// Gets or sets this Token's starting offset, the position of the first
/// character corresponding to this token in the source text. Note that the
/// difference between EndOffset and StartOffset may not equal the term's
/// length, as the text may have been altered by a stemmer or other filter.
/// </summary>
public virtual int StartOffset
{
    get { return startOffset; }
    set { this.startOffset = value; }
}
/// <summary>
/// Gets or sets this Token's ending offset, one greater than the position of
/// the last character corresponding to this token in the source text. The
/// length of the token in the source text is (endOffset - startOffset).
/// </summary>
public virtual int EndOffset
{
    get { return endOffset; }
    set { this.endOffset = value; }
}
/// <summary>
/// Sets the starting and ending offsets in one call.
/// See <see cref="StartOffset"/> and <see cref="EndOffset"/>.
/// </summary>
public virtual void SetOffset(int startOffset, int endOffset)
{
    this.startOffset = startOffset;
    this.endOffset = endOffset;
}
/// <summary>Gets or sets this Token's lexical type. Defaults to "word".</summary>
public string Type
{
    get { return type; }
    set { this.type = value; }
}
/// <summary>
/// EXPERIMENTAL: While we think this is here to stay, we may want to change
/// it to be a long.
/// <para>
/// Gets or sets the bitset for any bits that have been set. Completely
/// distinct from the token type, although they share similar purposes: the
/// flags can encode information about the token for use by other filters.
/// </para>
/// </summary>
/// <value>the bits</value>
public virtual int Flags
{
    get { return flags; }
    set { this.flags = value; }
}
/// <summary>Gets or sets this Token's payload; may be null.</summary>
public virtual Payload Payload
{
    get { return this.payload; }
    set { this.payload = value; }
}
/// <summary>
/// Returns a string of the form "(term,startOffset,endOffset[,type=...][,posIncr=...])".
/// The type is only shown when it differs from <see cref="DEFAULT_TYPE"/> and
/// the position increment only when it differs from 1.
/// </summary>
public override System.String ToString()
{
    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    sb.Append('(');
    // InitTermBuffer() guarantees termBuffer is non-null afterwards, so the
    // previous "null" fallback branch was unreachable and has been removed.
    InitTermBuffer();
    sb.Append(termBuffer, 0, termLength);
    sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
    // Compare against the named constant instead of the magic literal "word".
    if (!type.Equals(DEFAULT_TYPE))
        sb.Append(",type=").Append(type);
    if (positionIncrement != 1)
        sb.Append(",posIncr=").Append(positionIncrement);
    sb.Append(')');
    return sb.ToString();
}
/// <summary>
/// Resets the term length, payload, flags, position increment, start/end
/// offsets and token type to their defaults. The termBuffer allocation is
/// deliberately kept so the instance can be re-used without re-allocating.
/// </summary>
public override void Clear()
{
    payload = null;
    // Leave termBuffer to allow re-use
    termLength = 0;
    positionIncrement = 1;
    flags = 0;
    startOffset = endOffset = 0;
    type = DEFAULT_TYPE;
}
/// <summary>
/// Creates a deep copy of this token: the term buffer and the payload (when
/// present) are themselves cloned, so the copy shares no mutable state with
/// the original.
/// </summary>
public override System.Object Clone()
{
    Token clone = (Token) base.Clone();
    if (termBuffer != null)
    {
        char[] bufferCopy = new char[termBuffer.Length];
        Array.Copy(termBuffer, 0, bufferCopy, 0, termBuffer.Length);
        clone.termBuffer = bufferCopy;
    }
    if (payload != null)
        clone.payload = (Payload) payload.Clone();
    return clone;
}
/// <summary>
/// Makes a clone, but replaces the term buffer and start/end offsets in the
/// process. This is more efficient than doing a full clone (and then calling
/// SetTermBuffer) because it saves a wasted copy of the old termBuffer.
/// </summary>
public virtual Token Clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
{
    Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
    t.positionIncrement = positionIncrement;
    t.flags = flags;
    t.type = type;
    // Deep-copy the payload so the clone does not share mutable state.
    if (payload != null)
        t.payload = (Payload) payload.Clone();
    return t;
}
/// <summary>
/// Two tokens are equal when their term text, start/end offsets, flags,
/// position increment, type and payload all match.
/// </summary>
public override bool Equals(System.Object obj)
{
    if (ReferenceEquals(this, obj))
        return true;

    Token other = obj as Token;
    if (other == null)
        return false;

    // Make sure both term buffers exist before comparing characters.
    InitTermBuffer();
    other.InitTermBuffer();

    if (termLength != other.termLength
        || startOffset != other.startOffset
        || endOffset != other.endOffset
        || flags != other.flags
        || positionIncrement != other.positionIncrement
        || !SubEqual(type, other.type)
        || !SubEqual(payload, other.payload))
    {
        return false;
    }

    for (int i = 0; i < termLength; i++)
    {
        if (termBuffer[i] != other.termBuffer[i])
            return false;
    }
    return true;
}
/// <summary>Null-safe equality helper: true when both are null, otherwise o1.Equals(o2).</summary>
private bool SubEqual(System.Object o1, System.Object o2)
{
    return o1 == null ? o2 == null : o1.Equals(o2);
}
/// <summary>
/// Hash code consistent with <see cref="Equals(object)"/>: combines the term
/// content, offsets, flags, position increment, type and (optional) payload.
/// </summary>
public override int GetHashCode()
{
    InitTermBuffer();
    int code = termLength;
    code = code * 31 + startOffset;
    code = code * 31 + endOffset;
    code = code * 31 + flags;
    code = code * 31 + positionIncrement;
    code = code * 31 + type.GetHashCode();
    // The payload only contributes when present, mirroring SubEqual in Equals.
    code = (payload == null?code:code * 31 + payload.GetHashCode());
    code = code * 31 + ArrayUtil.HashCode(termBuffer, 0, termLength);
    return code;
}
// Like Clear() but leaves termLength/termBuffer untouched; used by the
// Reinit overloads, which overwrite the term text immediately afterwards.
private void ClearNoTermBuffer()
{
    payload = null;
    positionIncrement = 1;
    flags = 0;
    startOffset = endOffset = 0;
    type = DEFAULT_TYPE;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token: equivalent to
/// Clear() followed by SetTermBuffer(char[], int, int), setting the offsets
/// and setting the type.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
{
    // ClearNoTermBuffer() already resets payload and positionIncrement (and
    // flags/offsets/type); the duplicate assignments that followed it were
    // redundant and have been removed.
    ClearNoTermBuffer();
    SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token from a char[]
/// slice, with the type reset to <see cref="DEFAULT_TYPE"/>.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
{
    ClearNoTermBuffer();
    SetTermBuffer(newTermBuffer, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token from a string,
/// with the given offsets and type.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset, System.String newType)
{
    ClearNoTermBuffer();
    SetTermBuffer(newTerm);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token from a string
/// slice, with the given offsets and type.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, System.String newType)
{
    ClearNoTermBuffer();
    SetTermBuffer(newTerm, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = newType;
    return this;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token from a string,
/// with the type reset to <see cref="DEFAULT_TYPE"/>.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(System.String newTerm, int newStartOffset, int newEndOffset)
{
    ClearNoTermBuffer();
    SetTermBuffer(newTerm);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
}
/// <summary>
/// Shorthand for clearing all state and re-loading this token from a string
/// slice, with the type reset to <see cref="DEFAULT_TYPE"/>.
/// </summary>
/// <returns>this Token instance</returns>
public virtual Token Reinit(System.String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset)
{
    ClearNoTermBuffer();
    SetTermBuffer(newTerm, newTermOffset, newTermLength);
    startOffset = newStartOffset;
    endOffset = newEndOffset;
    type = DEFAULT_TYPE;
    return this;
}
/// <summary>
/// Copies the prototype token's fields into this one.
/// Note: the Payload instance is shared, not cloned.
/// </summary>
public virtual void Reinit(Token prototype)
{
    prototype.InitTermBuffer();
    SetTermBuffer(prototype.termBuffer, 0, prototype.termLength);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
}
/// <summary>
/// Copies the prototype token's fields into this one, with a different term
/// taken from the given string. Note: the Payload instance is shared, not cloned.
/// </summary>
public virtual void Reinit(Token prototype, System.String newTerm)
{
    SetTermBuffer(newTerm);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
}
/// <summary>
/// Copies the prototype token's fields into this one, with a different term
/// taken from a char[] slice. Note: the Payload instance is shared, not cloned.
/// </summary>
public virtual void Reinit(Token prototype, char[] newTermBuffer, int offset, int length)
{
    SetTermBuffer(newTermBuffer, offset, length);
    positionIncrement = prototype.positionIncrement;
    flags = prototype.flags;
    startOffset = prototype.startOffset;
    endOffset = prototype.endOffset;
    type = prototype.type;
    payload = prototype.payload;
}
/// <summary>
/// Copies this token's state into <paramref name="target"/>. When the target
/// is itself a Token, Reinit is used and the payload (which Reinit shares) is
/// cloned afterwards; otherwise each attribute interface is populated
/// individually, also with a cloned payload.
/// </summary>
public override void CopyTo(Attribute target)
{
    if (target is Token)
    {
        Token to = (Token) target;
        to.Reinit(this);
        // reinit shares the payload, so clone it:
        if (payload != null)
        {
            to.payload = (Payload) payload.Clone();
        }
    }
    else
    {
        InitTermBuffer();
        ((ITermAttribute) target).SetTermBuffer(termBuffer, 0, termLength);
        ((IOffsetAttribute) target).SetOffset(startOffset, endOffset);
        ((IPositionIncrementAttribute) target).PositionIncrement = positionIncrement;
        // Clone the payload so the target does not share mutable state with us.
        ((IPayloadAttribute) target).Payload = (payload == null)?null:(Payload) payload.Clone();
        ((IFlagsAttribute) target).Flags = flags;
        ((ITypeAttribute) target).Type = type;
    }
}
/// <summary>
/// Convenience factory that returns <see cref="Token"/> as implementation for
/// the basic attributes and returns the default impl (with "Impl" appended)
/// for all other attributes.
/// @since 3.0
/// </summary>
// FIX: a bare continuation line in the old comment lacked the "///" prefix,
// which made it stray source text. The field is also made readonly — it is a
// shared singleton (the Java original is "public static final") and must not
// be reassigned.
public static readonly AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
    new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
/// <summary>
/// Expert: Creates an AttributeFactory returning <see cref="Token"/> as
/// instance for the basic attributes and for all other attributes calls the
/// given delegate factory.
/// </summary>
public class TokenAttributeFactory : AttributeSource.AttributeFactory
{
    // Factory consulted for every attribute type Token does not implement.
    private readonly AttributeSource.AttributeFactory _delegateFactory;

    /// <summary>
    /// Creates a factory that answers with <see cref="Token"/> for the basic
    /// attributes and defers to <paramref name="delegateFactory"/> for all others.
    /// </summary>
    public TokenAttributeFactory(AttributeSource.AttributeFactory delegateFactory)
    {
        this._delegateFactory = delegateFactory;
    }

    /// <summary>
    /// Returns a new <see cref="Token"/> when <typeparamref name="T"/> is one
    /// of the attribute interfaces Token implements; otherwise delegates
    /// creation to the wrapped factory.
    /// </summary>
    public override Attribute CreateAttributeInstance<T>()
    {
        // BUGFIX: the method must be generic. The previous non-generic
        // signature left typeof(T) unresolved (a compile error) and did not
        // match the AttributeFactory.CreateAttributeInstance<T>() contract.
        return typeof(T).IsAssignableFrom(typeof(Token))
            ? new Token()
            : _delegateFactory.CreateAttributeInstance<T>();
    }

    public override bool Equals(Object other)
    {
        if (this == other) return true;
        if (other is TokenAttributeFactory)
        {
            TokenAttributeFactory af = (TokenAttributeFactory)other;
            return this._delegateFactory.Equals(af._delegateFactory);
        }
        return false;
    }

    public override int GetHashCode()
    {
        // XOR with an arbitrary constant so the factory wrapper hashes
        // differently from the bare delegate factory.
        return _delegateFactory.GetHashCode() ^ 0x0a45aa31;
    }
}
}
}