"""Index two sample strings in an in-memory Lucene index and dump a term vector.

The script stores each text in the field ``fieldname`` with term vectors that
carry positions and offsets, then reads the term vector of document 0 back and
prints every term together with its frequency, positions and offsets.
"""
from lucene import (
    StandardAnalyzer, RAMDirectory, Document, Field, Version,
    IndexWriter, IndexReader, TermPositionVector, initVM)


def build_index(directory, texts):
    """Add one document per entry of *texts* to *directory*.

    Each text is stored and analyzed in the field ``fieldname`` with term
    vectors that include positions and offsets. The writer is optimized after
    the last document and is always closed, even if indexing fails.
    """
    writer = IndexWriter(directory,
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True,
                         IndexWriter.MaxFieldLength.LIMITED)
    try:
        for text in texts:
            doc = Document()
            doc.add(Field("fieldname", text,
                          Field.Store.YES,
                          Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))
            writer.addDocument(doc)
        writer.optimize()
    finally:
        writer.close()


def dump_term_vector(tpv):
    """Print term, frequency, positions and offsets for every term in *tpv*.

    Positions and offsets are printed on a best-effort basis: if either cannot
    be read for a term, a ``no pos`` / ``no offsets`` line is printed instead.
    """
    # enumerate() supplies the term index without an arbitrary upper bound.
    pairs = zip(tpv.getTerms(), tpv.getTermFrequencies())
    for index, (term, freq) in enumerate(pairs):
        print('term %s' % term)
        print(' freq: %i' % freq)

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        # NOTE(review): presumably a vector without stored positions yields a
        # null result here -- confirm against the Lucene version in use.
        try:
            print(' pos: ' + str(list(tpv.getTermPositions(index))))
        except Exception:
            print(' no pos')

        try:
            spans = ["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                     for o in tpv.getOffsets(index)]
            print(' off: ' + str(spans))
        except Exception:
            print(' no offsets')


if __name__ == '__main__':
    initVM()
    directory = RAMDirectory()

    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be index"]
    build_index(directory, ts)

    ireader = IndexReader.open(directory, True)
    try:
        tpv = TermPositionVector.cast_(
            ireader.getTermFreqVector(0, 'fieldname'))
        dump_term_vector(tpv)
    finally:
        # Release the reader even if the dump raises.
        ireader.close()