# -*- coding: utf-8 -*-
# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

from unittest import TestCase, main
from lucene import ThaiAnalyzer, StringReader, Version
from BaseTokenStreamTestCase import BaseTokenStreamTestCase


class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
    """Checks the token sequences ThaiAnalyzer produces for sample inputs."""

    def testAnalyzer(self):
        """Analyze empty, Thai-only and mixed Thai/Latin text.

        Each call hands the analyzer an input string together with the
        exact list of token texts expected back from it.
        """
        analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)

        # Empty input is expected to yield no tokens at all.
        self._assertAnalyzesTo(analyzer, "", [])

        # Unbroken Thai text is expected to be split into separate words.
        self._assertAnalyzesTo(
            analyzer,
            "การที่ได้ต้องแสดงว่างานดี",
            ["การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"])

        # Mixed Thai/Latin input: "XY&Z" is expected back lower-cased and
        # the e-mail address is expected to remain a single token.
        self._assertAnalyzesTo(
            analyzer,
            "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
            ["บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com"])

        # English stop words
        # ("The" and "the" are absent from the expected token list.)
        self._assertAnalyzesTo(
            analyzer,
            "ประโยคว่า The quick brown fox jumped over the lazy dogs",
            ["ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over",
             "lazy", "dogs"])


if __name__ == "__main__":
    import sys
    import lucene

    # The JVM has to be started before any lucene class is used.
    lucene.initVM()

    if '-loop' in sys.argv:
        # Strip the private flag so unittest's own argument parsing does
        # not see it, then re-run the suite indefinitely.
        sys.argv.remove('-loop')
        while True:
            try:
                main()
            except (SystemExit, Exception):
                # unittest's main() finishes each run by raising
                # SystemExit; ignore it (and ordinary errors) so the loop
                # keeps going.  KeyboardInterrupt is deliberately NOT
                # caught, so Ctrl-C can still stop the loop.
                pass
    else:
        main()