# ====================================================================
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ====================================================================

# The print function keeps this module parseable by both Python 2.6+ and
# Python 3 tooling; it must stay the first statement in the file.
from __future__ import print_function

from lucene import \
    SimpleAnalyzer, StandardAnalyzer, StringReader, Version, \
    TermAttribute, PositionIncrementAttribute, TypeAttribute, OffsetAttribute


class AnalyzerUtils(object):
    """Class-level helpers for inspecting what an analyzer emits.

    All methods are classmethods, so they are called directly on the
    class, e.g. ``AnalyzerUtils.displayTokens(analyzer, text)``.
    """

    @classmethod
    def main(cls, argv):
        """Print full token details for two sample analyzers.

        ``argv`` is accepted for the script entry point but is not used.
        """
        print("SimpleAnalyzer")
        cls.displayTokensWithFullDetails(SimpleAnalyzer(),
                                         "The quick brown fox....")

        print("\n----")
        print("StandardAnalyzer")
        cls.displayTokensWithFullDetails(
            StandardAnalyzer(Version.LUCENE_CURRENT),
            "I'll e-mail you at xyz@example.com")

    @classmethod
    def setPositionIncrement(cls, source, posIncr):
        """Set the position increment attribute of ``source`` to ``posIncr``."""
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        attr.setPositionIncrement(posIncr)

    @classmethod
    def getPositionIncrement(cls, source):
        """Return the current position increment of ``source``."""
        attr = source.addAttribute(PositionIncrementAttribute.class_)
        return attr.getPositionIncrement()

    @classmethod
    def setTerm(cls, source, term):
        """Replace the term text of ``source`` with ``term``."""
        attr = source.addAttribute(TermAttribute.class_)
        attr.setTermBuffer(term)

    @classmethod
    def getTerm(cls, source):
        """Return the current term text of ``source``."""
        attr = source.addAttribute(TermAttribute.class_)
        return attr.term()

    # NOTE: the parameter name ``type`` shadows the builtin; it is kept
    # because callers may pass it by keyword.
    @classmethod
    def setType(cls, source, type):
        """Set the token type attribute of ``source`` to ``type``."""
        attr = source.addAttribute(TypeAttribute.class_)
        attr.setType(type)

    @classmethod
    def getType(cls, source):
        """Return the current token type of ``source``."""
        attr = source.addAttribute(TypeAttribute.class_)
        return attr.type()

    @classmethod
    def displayTokens(cls, analyzer, text):
        """Print every token of ``text`` as ``[term]`` on a single line.

        The text is analyzed under the field name "contents".  Each token
        is followed by one space and no trailing newline is written.
        """
        stream = analyzer.tokenStream("contents", StringReader(text))
        try:
            term = stream.addAttribute(TermAttribute.class_)
            while stream.incrementToken():
                print("[%s]" % term.term(), end=" ")
        finally:
            # Release the stream even if token iteration raises.
            stream.close()

    @classmethod
    def displayTokensWithPositions(cls, analyzer, text):
        """Print the tokens of ``text`` grouped by token position.

        The running position is the sum of the position increments seen so
        far.  Whenever a token has a positive increment a new line headed
        ``<position>:`` is started; tokens with a zero increment are
        printed on the line of the position they share.  A final newline
        ends the output.
        """
        stream = analyzer.tokenStream("contents", StringReader(text))
        try:
            term = stream.addAttribute(TermAttribute.class_)
            posIncr = stream.addAttribute(PositionIncrementAttribute.class_)

            position = 0
            while stream.incrementToken():
                increment = posIncr.getPositionIncrement()
                if increment > 0:
                    position += increment
                    print("\n%d:" % position, end=" ")

                print("[%s]" % term.term(), end=" ")
            print()
        finally:
            # Release the stream even if token iteration raises.
            stream.close()

    @classmethod
    def displayTokensWithFullDetails(cls, analyzer, text):
        """Print the tokens of ``text`` with offsets and type, by position.

        Output is grouped exactly like ``displayTokensWithPositions``, but
        each token is rendered as ``[term:startOffset->endOffset:type]``.
        """
        stream = analyzer.tokenStream("contents", StringReader(text))
        try:
            term = stream.addAttribute(TermAttribute.class_)
            posIncr = stream.addAttribute(PositionIncrementAttribute.class_)
            offset = stream.addAttribute(OffsetAttribute.class_)
            # Named typeAttr rather than ``type`` to avoid hiding the builtin.
            typeAttr = stream.addAttribute(TypeAttribute.class_)

            position = 0
            while stream.incrementToken():
                increment = posIncr.getPositionIncrement()
                if increment > 0:
                    position += increment
                    print("\n%d:" % position, end=" ")

                print("[%s:%d->%d:%s]" % (term.term(),
                                          offset.startOffset(),
                                          offset.endOffset(),
                                          typeAttr.type()),
                      end=" ")
            print()
        finally:
            # Release the stream even if token iteration raises.
            stream.close()

    # NOTE: the parameter name ``input`` shadows the builtin; it is kept
    # because callers may pass it by keyword.
    @classmethod
    def assertAnalyzesTo(cls, analyzer, input, outputs):
        """Assert that analyzing ``input`` yields exactly the terms ``outputs``.

        The text is analyzed under the field name "field" and the produced
        terms are compared, in order, with ``outputs``.

        Raises:
            AssertionError: if the stream ends before all expected terms
                were seen, if a term differs from the expected one, or if
                the stream produces more tokens than expected.
        """
        stream = analyzer.tokenStream("field", StringReader(input))
        try:
            termAttr = stream.addAttribute(TermAttribute.class_)
            for output in outputs:
                if not stream.incrementToken():
                    raise AssertionError(
                        "stream.incrementToken(): stream ended before "
                        "expected token %r" % (output,))
                actual = termAttr.term()
                if output != actual:
                    raise AssertionError(
                        "output == termAttr.term(): expected %r, got %r"
                        % (output, actual))
            if stream.incrementToken():
                raise AssertionError(
                    "not stream.incrementToken(): unexpected extra token %r"
                    % (termAttr.term(),))
        finally:
            # Close on the failure paths too, so a failed assertion does
            # not leak the stream.
            stream.close()


if __name__ == "__main__":
    import sys
    AnalyzerUtils.main(sys.argv)