# ==================================================================== # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== import sys, lucene, unittest from PyLuceneTestCase import PyLuceneTestCase from java.lang import Float from org.apache.lucene.analysis.core import SimpleAnalyzer from org.apache.lucene.document import \ Document, Field, TextField, FloatDocValuesField from org.apache.lucene.index import \ DocValues, FieldInvertState, IndexReader, LeafReaderContext, Term from org.apache.lucene.search import TermQuery from org.apache.pylucene.search.similarities import \ PythonPerFieldSimilarityWrapper, PythonSimilarity from org.apache.lucene.store import Directory from org.apache.lucene.util import BytesRef # # Tests the use of indexdocvalues in scoring. # # In the example, a docvalues field is used as a per-document boost (separate # from the norm) # SCORE_EPSILON = 0.001 # for comparing floats # # Similarity that wraps another similarity and boosts the final score # according to whats in a docvalues field. # class BoostingSimilarity(PythonSimilarity): def __init__(self, sim, boostField): super(BoostingSimilarity, self).__init__() self.sim = sim self.boostField = boostField def computeNorm(self, state): return self.sim.computeNorm(state) def computeWeight(self, boost, collectionStats, termStats): return self.sim.computeWeight(boost, collectionStats, termStats) def simScorer(self, stats, context): sub = self.sim.simScorer(stats, context) values = DocValues.getNumeric(context.reader(), self.boostField) class _SimScorer(PythonSimilarity.PythonSimScorer): def getValueForDoc(_self, doc): curDocID = values.docID() if doc < curDocID: raise ValueError("doc=" + doc + " is before curDocID=" + curDocID) if doc > curDocID: curDocID = values.advance(doc) if curDocID == doc: return Float.intBitsToFloat(int(values.longValue())) else: return 0.0 def score(_self, doc, freq): return _self.getValueForDoc(doc) * sub.score(doc, freq) return _SimScorer() class TestDocValuesScoring(PyLuceneTestCase): def testSimple(self): writer = self.getWriter(analyzer=SimpleAnalyzer()) doc = Document() field = Field("foo", "", TextField.TYPE_NOT_STORED) doc.add(field) dvField = FloatDocValuesField("foo_boost", 0.0) doc.add(dvField) field2 = Field("bar", "", TextField.TYPE_NOT_STORED) doc.add(field2) field.setStringValue("quick brown fox") field2.setStringValue("quick brown fox") dvField.setFloatValue(2.0) # boost x2 writer.addDocument(doc) field.setStringValue("jumps over lazy brown dog") field2.setStringValue("jumps over lazy brown dog") dvField.setFloatValue(4.0) # boost x4 writer.addDocument(doc) reader = writer.getReader() writer.close() # no boosting searcher1 = self.getSearcher(reader=reader) base = searcher1.getSimilarity(True) # boosting searcher2 = self.getSearcher(reader=reader) class _similarity(PythonPerFieldSimilarityWrapper): def __init__(_self, base): super(_similarity, _self).__init__() _self.base = base _self.fooSim = BoostingSimilarity(base, "foo_boost") def get(_self, field): return _self.fooSim if "foo" == field else _self.base searcher2.setSimilarity(_similarity(base)) # in this case, we searched on field "foo". first document should have # 2x the score. tq = TermQuery(Term("foo", "quick")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 2.0, SCORE_EPSILON) # this query matches only the second document, which should have 4x # the score. tq = TermQuery(Term("foo", "jumps")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score * 4.0, SCORE_EPSILON) # search on on field bar just for kicks, nothing should happen, since # we setup our sim provider to only use foo_boost for field foo. tq = TermQuery(Term("bar", "quick")) noboost = searcher1.search(tq, 10) boost = searcher2.search(tq, 10) self.assertEqual(1, noboost.totalHits) self.assertEqual(1, boost.totalHits) self.assertEqual( boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON) reader.close() if __name__ == "__main__": lucene.initVM(vmargs=['-Djava.awt.headless=true']) if '-loop' in sys.argv: sys.argv.remove('-loop') while True: try: unittest.main() except: pass else: unittest.main()