# -*- coding: utf-8 -*-
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================
#
#  Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)

try:
    from icu import Normalizer2, UNormalizationMode2
except ImportError:
    # PyICU may be absent at import time; the __main__ guard at the bottom
    # of this file does not run any test when ``import icu`` fails.
    pass

import re
import sys, lucene, unittest
from BaseTokenStreamTestCase import BaseTokenStreamTestCase

from org.apache.lucene.analysis import Analyzer
from org.apache.lucene.analysis.core import WhitespaceTokenizer
from org.apache.pylucene.analysis import PythonAnalyzer


# Inclusive range of ICU versions accepted by the __main__ guard, expressed
# as integer tuples so that they compare numerically.
_MIN_ICU_VERSION = (49,)
_MAX_ICU_VERSION = (59, 1)


def _icu_version_tuple(version):
    """Return the numeric components of a dotted version string as a tuple.

    For example ``"59.1"`` becomes ``(59, 1)``.  Comparing such tuples orders
    versions numerically, whereas comparing the raw strings orders them
    character by character.
    """
    return tuple(int(part) for part in re.findall(r'\d+', version))


class TestICUFoldingFilter(BaseTokenStreamTestCase):
    """Checks the tokens produced by ICUFoldingFilter over whitespace tokens."""

    def testDefaults(self):
        # Each assertion analyzes one input string with a WhitespaceTokenizer
        # wrapped in an ICUFoldingFilter and compares the emitted tokens with
        # the expected list.

        # Imported here rather than at module level so that loading this
        # module does not itself require lucene.ICUFoldingFilter.
        from lucene.ICUFoldingFilter import ICUFoldingFilter

        class _analyzer(PythonAnalyzer):
            # ``_self`` keeps the enclosing test's ``self`` unshadowed.
            def createComponents(_self, fieldName):
                source = WhitespaceTokenizer()
                return Analyzer.TokenStreamComponents(source, ICUFoldingFilter(source))

            def initReader(_self, fieldName, reader):
                return reader

        a = _analyzer()

        # case folding
        self._assertAnalyzesTo(a, "This is a test",
                               [ "this", "is", "a", "test" ])

        # case folding
        self._assertAnalyzesTo(a, "Ruß", [ "russ" ])

        # case folding with accent removal
        self._assertAnalyzesTo(a, "ΜΆΪΟΣ", [ "μαιοσ" ])
        self._assertAnalyzesTo(a, "Μάϊος", [ "μαιοσ" ])

        # supplementary case folding
        self._assertAnalyzesTo(a, "𐐖", [ "𐐾" ])

        # normalization
        self._assertAnalyzesTo(a, "ﴳﴺﰧ", [ "طمطمطم" ])

        # removal of default ignorables
        self._assertAnalyzesTo(a, "क्‍ष", [ "कष" ])

        # removal of latin accents (composed)
        self._assertAnalyzesTo(a, "résumé", [ "resume" ])

        # removal of latin accents (decomposed)
        self._assertAnalyzesTo(a, "re\u0301sume\u0301", [ "resume" ])

        # fold native digits
        self._assertAnalyzesTo(a, "৭০৬", [ "706" ])

        # ascii-folding-filter type stuff
        self._assertAnalyzesTo(a, "đis is cræzy", [ "dis", "is", "craezy" ])


if __name__ == "__main__":
    try:
        import icu
    except ImportError:
        # Without PyICU nothing is run.
        pass
    else:
        # Compare versions numerically; plain string comparison would order
        # them lexicographically.
        running = _icu_version_tuple(icu.ICU_VERSION)
        if _MIN_ICU_VERSION <= running <= _MAX_ICU_VERSION:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            if '-loop' in sys.argv:
                sys.argv.remove('-loop')
                while True:
                    try:
                        unittest.main()
                    except (SystemExit, Exception):
                        # unittest.main() exits via SystemExit after each
                        # run; swallow it (and ordinary errors) to keep
                        # looping.  KeyboardInterrupt is deliberately not
                        # caught so that Ctrl-C can stop the loop.
                        pass
            else:
                unittest.main()
        else:
            print("ICU version [49 - 59.1] is required, running:",
                  icu.ICU_VERSION, file=sys.stderr)