# -*- coding: utf-8 -*-
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================
#
#  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
#
#  Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
#
#  With this filter, you can normalize text in the following ways:
#   - NFKC Normalization, Case Folding, and removing Ignorables (the default)
#   - Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
#   - Based on rules from a custom normalization mapping.
#
#  If you use the defaults, this filter is a simple way to standardize
#  Unicode text in a language-independent way for search:
#   - The case folding that it does can be seen as a replacement for
#     LowerCaseFilter: For example, it handles cases such as the Greek
#     sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
#   - The normalization will standardize different forms of the same
#     character in Unicode. For example, CJK full-width numbers will be
#     standardized to their ASCII forms.
#   - Ignorables such as Zero-Width Joiner and Variation Selectors are
#     removed. These are typically modifier characters that affect display.
#
# ====================================================================

from icu import Normalizer2, UNormalizationMode2, UNormalizationCheckResult

from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.pylucene.analysis import PythonTokenFilter


class ICUNormalizer2Filter(PythonTokenFilter):
    """Token filter that rewrites each token's text with an ICU Normalizer2.

    When no normalizer is supplied, the "nfkc_cf" instance in COMPOSE mode
    is used, i.e. the default described in the file header (NFKC
    normalization, case folding, and removal of ignorables).
    """

    def __init__(self, input, normalizer=None):
        """Wrap ``input`` and select the normalizer to apply.

        Args:
            input: the wrapped token source; its ``incrementToken()`` is
                called once per call to this filter's ``incrementToken()``.
            normalizer: object providing ``quickCheck(text)`` and
                ``normalize(text)``; when ``None``, the value returned by
                ``Normalizer2.getInstance(None, "nfkc_cf", COMPOSE)`` is used.
        """
        super(ICUNormalizer2Filter, self).__init__(input)

        self.input = input
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

        # Only build the default instance when the caller did not pass one.
        self.normalizer = (
            normalizer
            if normalizer is not None
            else Normalizer2.getInstance(
                None, "nfkc_cf", UNormalizationMode2.COMPOSE
            )
        )

    def incrementToken(self):
        """Advance the wrapped input and normalize the current term text.

        Returns:
            ``False`` when the wrapped input reports no further token,
            ``True`` otherwise.
        """
        if not self.input.incrementToken():
            return False

        term = self.termAtt.toString()

        # Leave the term attribute untouched when the quick check already
        # answers YES; any other result goes through a full normalize().
        if self.normalizer.quickCheck(term) == UNormalizationCheckResult.YES:
            return True

        term_att = self.termAtt
        term_att.setEmpty()
        term_att.append(self.normalizer.normalize(term))
        return True