/* Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

parcel Lucy;

/** Unit of text.
 *
 * Token is the fundamental unit used by Apache Lucy's Analyzer subclasses.
 * Each Token has 5 attributes: text, start_offset, end_offset, boost, and
 * pos_inc.
 *
 * The text attribute is a Unicode string encoded as UTF-8.
 *
 * start_offset is the start point of the token text, measured in Unicode
 * code points from the top of the stored field; end_offset delimits the
 * corresponding closing boundary.  start_offset and end_offset locate the
 * Token within a larger context, even if the Token's text attribute gets
 * modified -- by stemming, for instance.  The Token for "beating" in the
 * text "beating a dead horse" begins life with a start_offset of 0 and an
 * end_offset of 7; after stemming, the text is "beat", but the start_offset
 * is still 0 and the end_offset is still 7.  This allows "beating" to be
 * highlighted correctly after a search matches "beat".
 *
 * boost is a per-token weight.  Use this when you want to assign more or
 * less importance to a particular token, as you might for emboldened text
 * within an HTML document, for example.  (Note: The field this token
 * belongs to must be spec'd to use a posting of type RichPosting.)
 *
 * pos_inc is the POSition INCrement, measured in Tokens.  This attribute,
 * which defaults to 1, is an advanced tool for manipulating phrase matching.
 * Ordinarily, Tokens are assigned consecutive position numbers: 0, 1, and 2
 * for "three blind mice".  However, if you set the position increment for
 * "blind" to, say, 1000, then the three tokens will end up assigned to
 * positions 0, 1, and 1001 -- and will no longer produce a phrase match for
 * the query "three blind mice".
 */
class Lucy::Analysis::Token inherits Clownfish::Obj {

    char     *text;
    size_t    len;
    uint32_t  start_offset;
    uint32_t  end_offset;
    float     boost;
    int32_t   pos_inc;
    int32_t   pos;

    inert incremented Token*
    new(const char *text, size_t len, uint32_t start_offset,
        uint32_t end_offset, float boost = 1.0, int32_t pos_inc = 1);

    inert Token*
    init(Token *self, const char *text, size_t len, uint32_t start_offset,
         uint32_t end_offset, float boost = 1.0, int32_t pos_inc = 1);

    /** Sort_quicksort-compatible comparison routine. */
    inert int
    compare(void *context, const void *va, const void *vb);

    uint32_t
    Get_Start_Offset(Token *self);

    uint32_t
    Get_End_Offset(Token *self);

    float
    Get_Boost(Token *self);

    int32_t
    Get_Pos_Inc(Token *self);

    char*
    Get_Text(Token *self);

    size_t
    Get_Len(Token *self);

    void
    Set_Text(Token *self, char *text, size_t len);

    public void
    Destroy(Token *self);
}
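
/* Editorial sketch, not part of the original header: the plain C program
 * below only illustrates the pos_inc and offset semantics documented above.
 * SimpleToken and the position loop are hypothetical stand-ins for Lucy's
 * Token and its internal position-assignment pass; they are not Lucy API.
 * It walks the "three blind mice" example, showing that each token takes
 * the running position and then advances it by its own pos_inc, so a
 * pos_inc of 1000 on "blind" yields positions 0, 1, and 1001.  Stemming
 * would change only the text attribute, never the offsets, which is what
 * keeps highlighting accurate.

#include <stdint.h>
#include <stdio.h>

// Hypothetical stand-in for the attributes documented in the class docs.
typedef struct {
    const char *text;         // UTF-8 token text
    uint32_t    start_offset; // code point where the token starts
    uint32_t    end_offset;   // code point just past the token
    float       boost;        // per-token weight, default 1.0
    int32_t     pos_inc;      // position increment, default 1
    int32_t     pos;          // filled in by the loop below
} SimpleToken;

int main(void) {
    // "three blind mice", with the increment for "blind" bumped to 1000.
    SimpleToken tokens[] = {
        {"three",  0,  5, 1.0f, 1,    0},
        {"blind",  6, 11, 1.0f, 1000, 0},
        {"mice",  12, 16, 1.0f, 1,    0},
    };
    // Assign each token the running position, then advance by its pos_inc.
    // Prints positions 0, 1, and 1001, matching the doc comment above.
    int32_t position = 0;
    for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
        tokens[i].pos = position;
        position += tokens[i].pos_inc;
        printf("%s -> pos %d (offsets %u-%u)\n", tokens[i].text,
               (int)tokens[i].pos, (unsigned)tokens[i].start_offset,
               (unsigned)tokens[i].end_offset);
    }
    return 0;
}
 */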