# -*- coding: utf-8 -*- # ==================================================================== # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== # # Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org) # # A TokenFilter that transforms text with ICU. # # ICU provides text-transformation functionality via its Transliteration API. # Although script conversion is its most common use, a Transliterator can # actually perform a more general class of tasks. In fact, Transliterator # defines a very general API which specifies only that a segment of the input # text is replaced by new text. The particulars of this conversion are # determined entirely by subclasses of Transliterator. # # Some useful transformations for search are built-in: # - Conversion from Traditional to Simplified Chinese characters # - Conversion from Hiragana to Katakana # - Conversion from Fullwidth to Halfwidth forms. # - Script conversions, for example Serbian Cyrillic to Latin # # Example usage:
#     stream = new ICUTransformFilter(stream,
#         Transliterator.getInstance("Traditional-Simplified"));
#
# For more details, see the ICU User Guide at:
# http://userguide.icu-project.org/transforms/general
#
# ====================================================================

from lucene import PythonTokenFilter, CharTermAttribute
from icu import Transliterator, UTransPosition


def _utf16_length(text):
    """Return the length of *text* measured in UTF-16 code units.

    ICU addresses text by UTF-16 code unit, so a character above U+FFFF
    occupies two positions there.  ``len()`` counts such a character once
    on Python builds that store full code points, so each one is counted a
    second time here.  On builds where strings are already sequences of
    UTF-16 units no element exceeds 0xFFFF and the result equals ``len()``.
    """
    return len(text) + sum(1 for ch in text if ord(ch) > 0xFFFF)


class ICUTransformFilter(PythonTokenFilter):
    """A TokenFilter that rewrites each term with an ICU Transliterator."""

    def __init__(self, input, transform):
        """Create a filter that transforms the terms of the given stream.

        :param input: the TokenStream to filter.
        :param transform: the Transliterator used to transform the text.
        """
        super(ICUTransformFilter, self).__init__(input)

        # Reusable position object; its fields are reset for every token.
        self.position = UTransPosition()

        # Term attribute; it is overwritten with the transformed text.
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

        self.input = input
        self.transform = transform

    def incrementToken(self):
        """Advance the wrapped stream and transform the current term.

        :return: True if the wrapped stream produced a token, in which case
            the term attribute now holds the transformed text; False if the
            wrapped stream is exhausted.
        """
        if not self.input.incrementToken():
            return False

        text = self.termAtt.toString()

        # The position offsets index UTF-16 code units, not code points:
        # using len(text) would leave the tail of any term containing
        # supplementary characters untransformed.
        length = _utf16_length(text)

        # Transform the whole term, using the whole term as context.
        position = self.position
        position.start = 0
        position.limit = length
        position.contextStart = 0
        position.contextLimit = length

        text = self.transform.filteredTransliterate(text, position, False)

        # Replace the term's contents with the transformed text.
        self.termAtt.setEmpty()
        self.termAtt.append(text)

        return True