/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Payload = Lucene.Net.Index.Payload;
namespace Lucene.Net.Analysis
{
/// A Token is an occurrence of a term from the text of a field. It consists of
/// a term's text, the start and end offset of the term in the text of the field,
/// and a type string.
///
/// The start and end offsets permit applications to re-associate a token with
/// its source text, e.g., to display highlighted query terms in a document
/// browser, or to show matching text fragments in a KWIC (KeyWord In Context)
/// display, etc.
///
/// The type is an interned string, assigned by a lexical analyzer
/// (a.k.a. tokenizer), naming the lexical or syntactic class that the token
/// belongs to. For example an end of sentence marker token might be implemented
/// with type "eos". The default token type is "word".
///
/// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
/// length byte array. Use {@link TermPositions#GetPayloadLength()} and
/// {@link TermPositions#GetPayload(byte[], int)} to retrieve the payloads from the index.
///
///
///
/// WARNING: The status of the Payloads feature is experimental.
/// The APIs introduced here might change in the future and will not be
/// supported anymore in such a case.
///
///
/// NOTE: As of 2.3, Token stores the term text
/// internally as a malleable char[] termBuffer instead of
/// String termText. The indexing code and core tokenizers
/// have been changed to re-use a single Token instance, changing
/// its buffer and other fields in-place as the Token is
/// processed. This provides substantially better indexing
/// performance as it saves the GC cost of new'ing a Token and
/// String for every term. The APIs that accept String
/// termText are still available but a warning about the
/// associated performance cost has been added (below). The
/// {@link #TermText()} method has been deprecated.
///
/// Tokenizers and filters should try to re-use a Token
/// instance when possible for best performance, by
/// implementing the {@link TokenStream#Next(Token)} API.
/// Failing that, to create a new Token you should first use
/// one of the constructors that starts with null text. Then
/// you should call either {@link #TermBuffer()} or {@link
/// #ResizeTermBuffer(int)} to retrieve the Token's
/// termBuffer. Fill in the characters of your term into this
/// buffer, and finally call {@link #SetTermLength(int)} to
/// set the length of the term text. See LUCENE-969
/// for details.
///
///
///
public class Token : System.ICloneable
{
    /// Lexical type given to a Token when no explicit type is supplied.
    public const System.String DEFAULT_TYPE = "word";

    /// Smallest capacity ever allocated for termBuffer.
    private const int MIN_BUFFER_SIZE = 10;

    /// String form of the term text. Deprecated storage: it will be removed
    /// together with the deprecated String-based APIs. At most one of
    /// termText / termBuffer is treated as authoritative at a time; see
    /// InitTermBuffer().
    private System.String termText;

    internal char[] termBuffer; // characters for the term text
    internal int termLength; // length of term text in buffer
    internal int startOffset; // start in source text
    internal int endOffset; // end in source text
    internal System.String type = DEFAULT_TYPE; // lexical type
    internal Payload payload;
    internal int positionIncrement = 1;

    /// Constructs a Token with null text.
    public Token()
    {
    }

    /// Constructs a Token with null text and start & end offsets.
    ///
    /// start: start offset
    /// end: end offset
    public Token(int start, int end)
    {
        startOffset = start;
        endOffset = end;
    }

    /// Constructs a Token with null text and start & end offsets plus
    /// the Token type.
    ///
    /// start: start offset
    /// end: end offset
    /// typ: token type
    public Token(int start, int end, System.String typ)
    {
        startOffset = start;
        endOffset = end;
        type = typ;
    }

    /// Constructs a Token with the given term text, and start & end
    /// offsets. The type defaults to "word".
    /// NOTE: for better indexing speed you should instead use the char[]
    /// termBuffer methods to set the term text.
    ///
    /// text: term text
    /// start: start offset
    /// end: end offset
    public Token(System.String text, int start, int end)
    {
        termText = text;
        startOffset = start;
        endOffset = end;
    }

    /// Constructs a Token with the given text, start and end offsets,
    /// & type.
    /// NOTE: for better indexing speed you should instead use the char[]
    /// termBuffer methods to set the term text.
    ///
    /// text: term text
    /// start: start offset
    /// end: end offset
    /// typ: token type
    public Token(System.String text, int start, int end, System.String typ)
    {
        termText = text;
        startOffset = start;
        endOffset = end;
        type = typ;
    }

    /// Set the position increment. This determines the position of this
    /// token relative to the previous Token in a TokenStream, used in
    /// phrase searching.
    ///
    /// The default value is one.
    ///
    /// Some common uses for this are:
    ///
    /// - Set it to zero to put multiple terms in the same position. This is
    ///   useful if, e.g., a word has multiple stems. Searches for phrases
    ///   including either stem will match. In this case, all but the first
    ///   stem's increment should be set to zero: the increment of the first
    ///   instance should be one. Repeating a token with an increment of zero
    ///   can also be used to boost the scores of matches on that token.
    ///
    /// - Set it to values greater than one to inhibit exact phrase matches.
    ///   If, for example, one does not want phrases to match across removed
    ///   stop words, then one could build a stop word filter that removes
    ///   stop words and also sets the increment to the number of stop words
    ///   removed before each non-stop word. Then exact phrase queries will
    ///   only match when the terms occur with no intervening stop words.
    ///
    /// Throws System.ArgumentException when positionIncrement is negative.
    public virtual void SetPositionIncrement(int positionIncrement)
    {
        if (positionIncrement < 0)
        {
            throw new System.ArgumentException("Increment must be zero or greater: " + positionIncrement);
        }
        this.positionIncrement = positionIncrement;
    }

    /// Returns the position increment of this Token.
    public virtual int GetPositionIncrement()
    {
        return positionIncrement;
    }

    /// Sets the Token's term text and discards the current termBuffer.
    /// NOTE: for better indexing speed you should instead use the char[]
    /// termBuffer methods to set the term text.
    public virtual void SetTermText(System.String text)
    {
        termText = text;
        termBuffer = null;
    }

    /// Returns the Token's term text, building (and caching) a String from
    /// the first termLength characters of termBuffer when only the buffer
    /// form exists. Returns null when the Token has neither form.
    ///
    /// Deprecated: use TermBuffer() and TermLength() instead.
    public System.String TermText()
    {
        if (termText == null && termBuffer != null)
        {
            termText = new System.String(termBuffer, 0, termLength);
        }
        return termText;
    }

    /// Copies the contents of buffer, starting at offset for length
    /// characters, into the termBuffer array and sets the term length to
    /// length.
    /// NOTE: for better indexing speed you should instead retrieve the
    /// termBuffer, using TermBuffer() or ResizeTermBuffer(int), and fill it
    /// in directly to set the term text. This saves an extra copy.
    public void SetTermBuffer(char[] buffer, int offset, int length)
    {
        ResizeTermBuffer(length);
        Array.Copy(buffer, offset, termBuffer, 0, length);
        termLength = length;
    }

    /// Returns the internal termBuffer character array which you can then
    /// directly alter. If the array is too small for your token, use
    /// ResizeTermBuffer(int) to increase it. After altering the buffer be
    /// sure to call SetTermLength(int) to record the number of valid
    /// characters that were placed into the termBuffer.
    public char[] TermBuffer()
    {
        InitTermBuffer();
        return termBuffer;
    }

    /// Grows the termBuffer to at least size newSize, keeping its current
    /// contents. The buffer is never shrunk.
    ///
    /// newSize: minimum size of the new termBuffer
    /// Returns the termBuffer, whose length is >= newSize.
    public virtual char[] ResizeTermBuffer(int newSize)
    {
        InitTermBuffer();
        if (newSize > termBuffer.Length)
        {
            // Double in 64-bit arithmetic, starting from at least 1: a 32-bit
            // product eventually wraps to 0 for very large requests, and a
            // zero-length buffer never grows, either of which would loop
            // forever.
            long size = Math.Max(termBuffer.Length, 1);
            while (size < newSize)
            {
                size *= 2;
            }
            if (size > System.Int32.MaxValue)
            {
                // The doubled size is not addressable; allocate exactly what
                // was asked for instead.
                size = newSize;
            }
            char[] newBuffer = new char[(int) size];
            // Copy the whole old buffer, not just termLength characters: the
            // caller may have filled it before recording the length.
            Array.Copy(termBuffer, 0, newBuffer, 0, termBuffer.Length);
            termBuffer = newBuffer;
        }
        return termBuffer;
    }

    // Makes termBuffer the authoritative form of the term text: allocates it
    // if needed (seeding it from termText when one is set) and clears
    // termText.
    // TODO: once we remove the deprecated termText() method
    // and switch entirely to char[] termBuffer we don't need
    // to use this method anymore
    private void InitTermBuffer()
    {
        if (termBuffer != null)
        {
            // Buffer already exists; drop any String copy cached by TermText()
            termText = null;
            return;
        }
        if (termText == null)
        {
            termBuffer = new char[MIN_BUFFER_SIZE];
            termLength = 0;
            return;
        }
        int length = termText.Length;
        termBuffer = new char[Math.Max(length, MIN_BUFFER_SIZE)];
        termText.CopyTo(0, termBuffer, 0, length);
        termLength = length;
        termText = null;
    }

    /// Return number of valid characters (length of the term) in the
    /// termBuffer array.
    public int TermLength()
    {
        InitTermBuffer();
        return termLength;
    }

    /// Set number of valid characters (length of the term) in the
    /// termBuffer array.
    ///
    /// Throws System.ArgumentException when length is negative or larger
    /// than the current termBuffer; call ResizeTermBuffer(int) first if more
    /// room is needed.
    public void SetTermLength(int length)
    {
        InitTermBuffer();
        if (length < 0)
        {
            throw new System.ArgumentException("length must be zero or greater: " + length);
        }
        if (length > termBuffer.Length)
        {
            throw new System.ArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.Length + ")");
        }
        termLength = length;
    }

    /// Returns this Token's starting offset, the position of the first
    /// character corresponding to this token in the source text.
    /// Note that the difference between EndOffset() and StartOffset() may
    /// not be equal to the term length, as the term text may have been
    /// altered by a stemmer or some other filter.
    public int StartOffset()
    {
        return startOffset;
    }

    /// Set the starting offset.
    public virtual void SetStartOffset(int offset)
    {
        this.startOffset = offset;
    }

    /// Returns this Token's ending offset, one greater than the position of
    /// the last character corresponding to this token in the source text.
    public int EndOffset()
    {
        return endOffset;
    }

    /// Set the ending offset.
    public virtual void SetEndOffset(int offset)
    {
        this.endOffset = offset;
    }

    /// Returns this Token's lexical type. Defaults to "word".
    public System.String Type()
    {
        return type;
    }

    /// Set the lexical type.
    public void SetType(System.String type)
    {
        this.type = type;
    }

    /// Returns this Token's payload.
    public virtual Payload GetPayload()
    {
        return this.payload;
    }

    /// Sets this Token's payload.
    public virtual void SetPayload(Payload payload)
    {
        this.payload = payload;
    }

    /// Formats the Token as "(text,start,end)", followed by ",type=..." when
    /// the type differs from the default and ",posIncr=..." when the
    /// position increment differs from 1. A null type is printed as an
    /// empty value rather than throwing.
    public override System.String ToString()
    {
        // Kept from the original: this converts the Token to its buffer form.
        // After this call termBuffer is never null.
        InitTermBuffer();

        System.Text.StringBuilder sb = new System.Text.StringBuilder();
        sb.Append('(');
        sb.Append(termBuffer, 0, termLength);
        sb.Append(',').Append(startOffset).Append(',').Append(endOffset);
        // Comparing from the constant side keeps a null type from throwing
        if (!DEFAULT_TYPE.Equals(type))
        {
            sb.Append(",type=").Append(type);
        }
        if (positionIncrement != 1)
        {
            sb.Append(",posIncr=").Append(positionIncrement);
        }
        sb.Append(')');
        return sb.ToString();
    }

    /// Resets the term text, payload, and positionIncrement to default.
    /// Other fields such as startOffset, endOffset and the token type are
    /// not reset since they are normally overwritten by the tokenizer.
    public virtual void Clear()
    {
        payload = null;
        // Leave termBuffer allocated so it can be re-used
        termLength = 0;
        termText = null;
        positionIncrement = 1;
    }

    /// Returns a copy of this Token. The copy gets its own termBuffer
    /// holding the first termLength characters, and a payload obtained from
    /// the payload's own Clone(); all other fields are copied as-is.
    ///
    /// Any exception raised while copying is wrapped in a
    /// System.SystemException.
    public virtual System.Object Clone()
    {
        try
        {
            Token t = (Token) base.MemberwiseClone();
            if (termBuffer != null)
            {
                // MemberwiseClone shares the array; detach it and copy
                t.termBuffer = null;
                t.SetTermBuffer(termBuffer, 0, termLength);
            }
            if (payload != null)
            {
                t.SetPayload((Payload) payload.Clone());
            }
            return t;
        }
        catch (System.Exception e)
        {
            throw new System.SystemException("", e); // shouldn't happen
        }
    }
}
}