# ==================================================================== # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==================================================================== import math from itertools import izip from random import randint from unittest import TestCase, main from lucene import * NUM_STRINGS = 6000 class SortTestCase(TestCase): """ Unit tests for sorting code, ported from Java Lucene """ def __init__(self, *args, **kwds): super(SortTestCase, self).__init__(*args, **kwds) self.data = [ # tracer contents int float string custom i18n long double, short, byte, custom parser encoding' [ "A", "x a", "5", "4f", "c", "A-3", u"p\u00EAche", "10", "-4.0", "3", "126", "J" ], [ "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0", "24", "1", "I" ], [ "C", "x a b c", "2147483647", "1.0", "j", "A-2", u"p\u00E9ch\u00E9", "99999999", "40.00002343", "125", "15", "H" ], [ "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", str(Long.MAX_VALUE), str(Double.MIN_VALUE), str(Short.MIN_VALUE), str(Byte.MIN_VALUE), "G" ], [ "E", "x a b c d", "5", "2f", "h", "B-8", "peach", str(Long.MIN_VALUE), str(Double.MAX_VALUE), str(Short.MAX_VALUE), str(Byte.MAX_VALUE), "F" ], [ "F", "y a b c d", "2", "3.14159f", "g", "B-1", u"H\u00C5T", "-44", "343.034435444", "-3", "0", "E" ], [ "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544", "5", "100", "D" ], [ "H", "y a b c d", "0", "1.4E-45", "e", "C-88", u"H\u00D8T", "1023423423005", "4.043545", "10", "-50", "C" ], [ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", u"s\u00EDn", "332422459999", "4.043546", "-340", "51", "B" ], [ "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343", "300", "2", "A" ], [ "W", "g", "1", None, None, None, None, None, None, None, None, None ], [ "X", "g", "1", "0.1", None, None, None, None, None, None, None, None ], [ "Y", "g", "1", "0.2", None, None, None, None, None, None, None, None ], [ "Z", "f g", None, None, None, None, None, None, None, None, None, None ], ] def _getIndex(self, even, odd): indexStore = RAMDirectory() writer = IndexWriter(indexStore, SimpleAnalyzer(), True, IndexWriter.MaxFieldLength.LIMITED) writer.setMaxBufferedDocs(2) writer.setMergeFactor(1000) for i in xrange(len(self.data)): if (i % 2 == 0 and even) or (i % 2 == 1 and odd): doc = Document() doc.add(Field("tracer", self.data[i][0], Field.Store.YES, Field.Index.NO)) doc.add(Field("contents", self.data[i][1], Field.Store.NO, Field.Index.ANALYZED)) if self.data[i][2] is not None: doc.add(Field("int", self.data[i][2], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][3] is not None: doc.add(Field("float", self.data[i][3], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][4] is not None: doc.add(Field("string", self.data[i][4], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][5] is not None: doc.add(Field("custom", self.data[i][5], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][6] is not None: doc.add(Field("i18n", self.data[i][6], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][7] is not None: doc.add(Field("long", self.data[i][7], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][8] is not None: doc.add(Field("double", self.data[i][8], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][9] is not None: doc.add(Field("short", self.data[i][9], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][10] is not None: doc.add(Field("byte", self.data[i][10], Field.Store.NO, Field.Index.NOT_ANALYZED)) if self.data[i][11] is not None: doc.add(Field("parser", self.data[i][11], Field.Store.NO, Field.Index.NOT_ANALYZED)) doc.setBoost(2.0) # produce some scores above 1.0 writer.addDocument(doc) # writer.optimize() writer.close() s = IndexSearcher(indexStore) s.setDefaultFieldSortScoring(True, True) return s def _getFullIndex(self): return self._getIndex(True, True) def getFullStrings(self): indexStore = RAMDirectory() writer = IndexWriter(indexStore, SimpleAnalyzer(), True, IndexWriter.MaxFieldLength.LIMITED) writer.setMaxBufferedDocs(4) writer.setMergeFactor(97) for i in xrange(NUM_STRINGS): doc = Document() num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52) doc.add(Field("tracer", num, Field.Store.YES, Field.Index.NO)) # doc.add(Field("contents", str(i), Field.Store.NO, # Field.Index.ANALYZED)) doc.add(Field("string", num, Field.Store.NO, Field.Index.NOT_ANALYZED)) num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50) doc.add(Field("string2", num2, Field.Store.NO, Field.Index.NOT_ANALYZED)) doc.add(Field("tracer2", num2, Field.Store.YES, Field.Index.NO)) doc.setBoost(2.0) # produce some scores above 1.0 writer.setMaxBufferedDocs(self.getRandomNumber(2, 12)) writer.addDocument(doc) # writer.optimize() # print writer.getSegmentCount() writer.close() return IndexSearcher(indexStore) def getRandomNumberString(self, num, low, high): return ''.join([self.getRandomNumber(low, high) for i in xrange(num)]) def getRandomCharString(self, num): return self.getRandomCharString(num, 48, 122) def getRandomCharString(self, num, start, end): return ''.join([chr(self.getRandomNumber(start, end)) for i in xrange(num)]) def getRandomNumber(self, low, high): return randint(low, high) def _getXIndex(self): return self._getIndex(True, False) def _getYIndex(self): return self._getIndex(False, True) def _getEmptyIndex(self): return self._getIndex(False, False) def setUp(self): self.full = self._getFullIndex() self.searchX = self._getXIndex() self.searchY = self._getYIndex() self.queryX = TermQuery(Term("contents", "x")) self.queryY = TermQuery(Term("contents", "y")) self.queryA = TermQuery(Term("contents", "a")) self.queryE = TermQuery(Term("contents", "e")) self.queryF = TermQuery(Term("contents", "f")) self.queryG = TermQuery(Term("contents", "g")) def testBuiltInSorts(self): """ test the sorts by score and document number """ sort = Sort() self._assertMatches(self.full, self.queryX, sort, "ACEGI") self._assertMatches(self.full, self.queryY, sort, "BDFHJ") sort.setSort(SortField.FIELD_DOC) self._assertMatches(self.full, self.queryX, sort, "ACEGI") self._assertMatches(self.full, self.queryY, sort, "BDFHJ") def testTypedSort(self): """ test sorts where the type of field is specified """ sort = Sort() sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "IGAEC") self._assertMatches(self.full, self.queryY, sort, "DHFJB") sort.setSort([SortField("float", SortField.FLOAT), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "GCIEA") self._assertMatches(self.full, self.queryY, sort, "DHJFB") sort.setSort([SortField("long", SortField.LONG), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "EACGI") self._assertMatches(self.full, self.queryY, sort, "FBJHD") sort.setSort([SortField("double", SortField.DOUBLE), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "AGICE") self._assertMatches(self.full, self.queryY, sort, "DJHBF") sort.setSort([SortField("byte", SortField.BYTE), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "CIGAE") self._assertMatches(self.full, self.queryY, sort, "DHFBJ") sort.setSort([SortField("short", SortField.SHORT), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "IAGCE") self._assertMatches(self.full, self.queryY, sort, "DFHBJ") sort.setSort([SortField("string", SortField.STRING), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "AIGEC") self._assertMatches(self.full, self.queryY, sort, "DJHFB") def testStringSort(self): """ Test String sorting: small queue to many matches, multi field sort, reverse sort """ sort = Sort() searcher = self.getFullStrings() sort.setSort([SortField("string", SortField.STRING), SortField("string2", SortField.STRING, True), SortField.FIELD_DOC]) result = searcher.search(MatchAllDocsQuery(), None, 500, sort).scoreDocs buff = [] last = None lastSub = None lastDocId = 0 fail = False for scoreDoc in result: doc2 = searcher.doc(scoreDoc.doc) v = doc2.getValues("tracer") v2 = doc2.getValues("tracer2") for _v, _v2 in izip(v, v2): if last is not None: _cmp = cmp(_v, last) if _cmp < 0: # ensure first field is in order fail = True print "fail:", _v, "<", last if _cmp == 0: # ensure second field is in reverse order _cmp = cmp(_v2, lastSub) if _cmp > 0: fail = True print "rev field fail:", _v2, ">", lastSub elif _cmp == 0: # ensure docid is in order if scoreDoc.doc < lastDocId: fail = True print "doc fail:", scoreDoc.doc, ">", lastDocId last = _v lastSub = _v2 lastDocId = scoreDoc.doc buff.append(_v + "(" + _v2 + ")(" + str(scoreDoc.doc) + ") ") if fail: print "topn field1(field2)(docID):", ''.join(buff) self.assert_(not fail, "Found sort results out of order") def testCustomFieldParserSort(self): """ test sorts where the type of field is specified and a custom field parser is used, that uses a simple char encoding. The sorted string contains a character beginning from 'A' that is mapped to a numeric value using some "funny" algorithm to be different for each data type. """ # since tests explicitly use different parsers on the same field name # we explicitly check/purge the FieldCache between each assertMatch fc = FieldCache.DEFAULT class intParser(PythonIntParser): def parseInt(_self, val): return (ord(val[0]) - ord('A')) * 123456 class floatParser(PythonFloatParser): def parseFloat(_self, val): return math.sqrt(ord(val[0])) class longParser(PythonLongParser): def parseLong(_self, val): return (ord(val[0]) - ord('A')) * 1234567890L class doubleParser(PythonDoubleParser): def parseDouble(_self, val): return math.pow(ord(val[0]), ord(val[0]) - ord('A')) class byteParser(PythonByteParser): def parseByte(_self, val): return chr(ord(val[0]) - ord('A')) class shortParser(PythonShortParser): def parseShort(_self, val): return ord(val[0]) - ord('A') sort = Sort() sort.setSort([SortField("parser", intParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " IntParser") fc.purgeAllCaches() sort.setSort([SortField("parser", floatParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " FloatParser") fc.purgeAllCaches() sort.setSort([SortField("parser", longParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " LongParser") fc.purgeAllCaches() sort.setSort([SortField("parser", doubleParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " DoubleParser") fc.purgeAllCaches() sort.setSort([SortField("parser", byteParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " ByteParser") fc.purgeAllCaches() sort.setSort([SortField("parser", shortParser()), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") self._assertSaneFieldCaches(self.getName() + " ShortParser") fc.purgeAllCaches() def testEmptyIndex(self): """ test sorts when there's nothing in the index """ sort = Sort() empty = self._getEmptyIndex() self._assertMatches(empty, self.queryX, sort, "") sort.setSort(SortField.FIELD_DOC) self._assertMatches(empty, self.queryX, sort, "") sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC]) self._assertMatches(empty, self.queryX, sort, "") sort.setSort([SortField("string", SortField.STRING, True), SortField.FIELD_DOC]) self._assertMatches(empty, self.queryX, sort, "") sort.setSort([SortField("float", SortField.FLOAT), SortField("string", SortField.STRING)]) self._assertMatches(empty, self.queryX, sort, "") def testNewCustomFieldParserSort(self): """ Test sorting w/ custom FieldComparator """ sort = Sort() sort.setSort([SortField("parser", MyFieldComparatorSource())]) self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA") def testAutoSort(self): """ test sorts where the type of field is determined dynamically """ sort = Sort() sort.setSort("int") self._assertMatches(self.full, self.queryX, sort, "IGAEC") self._assertMatches(self.full, self.queryY, sort, "DHFJB") sort.setSort("float") self._assertMatches(self.full, self.queryX, sort, "GCIEA") self._assertMatches(self.full, self.queryY, sort, "DHJFB") sort.setSort("string") self._assertMatches(self.full, self.queryX, sort, "AIGEC") self._assertMatches(self.full, self.queryY, sort, "DJHFB") def testReverseSort(self): """ test sorts in reverse """ sort = Sort() sort.setSort([SortField(None, SortField.SCORE, True), SortField.FIELD_DOC]) self._assertMatches(self.full, self.queryX, sort, "IEGCA") self._assertMatches(self.full, self.queryY, sort, "JFHDB") sort.setSort(SortField(None, SortField.DOC, True)) self._assertMatches(self.full, self.queryX, sort, "IGECA") self._assertMatches(self.full, self.queryY, sort, "JHFDB") sort.setSort("int", True) self._assertMatches(self.full, self.queryX, sort, "CAEGI") self._assertMatches(self.full, self.queryY, sort, "BJFHD") sort.setSort("float", True) self._assertMatches(self.full, self.queryX, sort, "AECIG") self._assertMatches(self.full, self.queryY, sort, "BFJHD") sort.setSort("string", True) self._assertMatches(self.full, self.queryX, sort, "CEGIA") self._assertMatches(self.full, self.queryY, sort, "BFHJD") def testEmptyFieldSort(self): """ test sorting when the sort field is empty(undefined) for some of the documents """ sort = Sort() sort.setSort("string") self._assertMatches(self.full, self.queryF, sort, "ZJI") sort.setSort("string", True) self._assertMatches(self.full, self.queryF, sort, "IJZ") sort.setSort(SortField("i18n", Locale.ENGLISH)) self._assertMatches(self.full, self.queryF, sort, "ZJI") sort.setSort(SortField("i18n", Locale.ENGLISH, True)) self._assertMatches(self.full, self.queryF, sort, "IJZ") sort.setSort("int") self._assertMatches(self.full, self.queryF, sort, "IZJ") sort.setSort("int", True) self._assertMatches(self.full, self.queryF, sort, "JZI") sort.setSort("float") self._assertMatches(self.full, self.queryF, sort, "ZJI") # using a nonexisting field as first sort key shouldn't make a # difference: sort.setSort([SortField("nosuchfield", SortField.STRING), SortField("float")]) self._assertMatches(self.full, self.queryF, sort, "ZJI") sort.setSort("float", True) self._assertMatches(self.full, self.queryF, sort, "IJZ") # When a field is None for both documents, the next SortField should # be used. # Works for sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float")]) self._assertMatches(self.full, self.queryG, sort, "ZWXY") # Reverse the last criterium to make sure the test didn't pass by # chance sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float", True)]) self._assertMatches(self.full, self.queryG, sort, "ZYXW") # Do the same for a MultiSearcher multiSearcher = MultiSearcher([self.full]) sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float")]) self._assertMatches(multiSearcher, self.queryG, sort, "ZWXY") sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float", True)]) self._assertMatches(multiSearcher, self.queryG, sort, "ZYXW") # Don't close the multiSearcher. it would close the full searcher too! # Do the same for a ParallelMultiSearcher parallelSearcher = ParallelMultiSearcher([self.full]) sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float")]) self._assertMatches(parallelSearcher, self.queryG, sort, "ZWXY") sort.setSort([SortField("int"), SortField("string", SortField.STRING), SortField("float", True)]) self._assertMatches(parallelSearcher, self.queryG, sort, "ZYXW") # Don't close the parallelSearcher. it would close the full searcher # too! def testSortCombos(self): """ test sorts using a series of fields """ sort = Sort() sort.setSort(["int", "float"]) self._assertMatches(self.full, self.queryX, sort, "IGEAC") sort.setSort([SortField("int", True), SortField(None, SortField.DOC, True)]) self._assertMatches(self.full, self.queryX, sort, "CEAGI") sort.setSort(["float", "string"]) self._assertMatches(self.full, self.queryX, sort, "GICEA") def testLocaleSort(self): """ test using a Locale for sorting strings """ sort = Sort() sort.setSort([SortField("string", Locale.US)]) self._assertMatches(self.full, self.queryX, sort, "AIGEC") self._assertMatches(self.full, self.queryY, sort, "DJHFB") sort.setSort([SortField("string", Locale.US, True)]) self._assertMatches(self.full, self.queryX, sort, "CEGIA") self._assertMatches(self.full, self.queryY, sort, "BFHJD") def testInternationalSort(self): """ test using various international locales with accented characters (which sort differently depending on locale) """ sort = Sort() sort.setSort(SortField("i18n", Locale.US)) self._assertMatches(self.full, self.queryY, sort, "BFJDH") sort.setSort(SortField("i18n", Locale("sv", "se"))) self._assertMatches(self.full, self.queryY, sort, "BJDFH") sort.setSort(SortField("i18n", Locale("da", "dk"))) self._assertMatches(self.full, self.queryY, sort, "BJDHF") sort.setSort(SortField("i18n", Locale.US)) self._assertMatches(self.full, self.queryX, sort, "ECAGI") sort.setSort(SortField("i18n", Locale.FRANCE)) self._assertMatches(self.full, self.queryX, sort, "EACGI") def testInternationalMultiSearcherSort(self): """ Test the MultiSearcher's ability to preserve locale-sensitive ordering by wrapping it around a single searcher """ sort = Sort() multiSearcher = MultiSearcher([self.full]) sort.setSort(SortField("i18n", Locale("sv", "se"))) self._assertMatches(multiSearcher, self.queryY, sort, "BJDFH") sort.setSort(SortField("i18n", Locale.US)) self._assertMatches(multiSearcher, self.queryY, sort, "BFJDH") sort.setSort(SortField("i18n", Locale("da", "dk"))) self._assertMatches(multiSearcher, self.queryY, sort, "BJDHF") def testCustomSorts(self): """ test a custom sort function """ sort = Sort() sort.setSort(SortField("custom", SampleComparable.getComparatorSource())) self._assertMatches(self.full, self.queryX, sort, "CAIEG") sort.setSort(SortField("custom", SampleComparable.getComparatorSource(), True)) self._assertMatches(self.full, self.queryY, sort, "HJDBF") custom = SampleComparable.getComparator() sort.setSort(SortField("custom", custom)) self._assertMatches(self.full, self.queryX, sort, "CAIEG") sort.setSort(SortField("custom", custom, True)) self._assertMatches(self.full, self.queryY, sort, "HJDBF") def testMultiSort(self): """ test a variety of sorts using more than one searcher """ searcher = MultiSearcher([self.searchX, self.searchY]) self.runMultiSorts(searcher, False) def testParallelMultiSort(self): """ test a variety of sorts using a parallel multisearcher """ searcher = ParallelMultiSearcher([self.searchX, self.searchY]) self.runMultiSorts(searcher, False) def testNormalizedScores(self): """ test that the relevancy scores are the same even if hits are sorted """ # capture relevancy scores scoresX = self.getScores(self.full.search(self.queryX, None, 1000).scoreDocs, self.full) scoresY = self.getScores(self.full.search(self.queryY, None, 1000).scoreDocs, self.full) scoresA = self.getScores(self.full.search(self.queryA, None, 1000).scoreDocs, self.full) # we'll test searching locally, remote and multi multi = MultiSearcher([self.searchX, self.searchY]) # change sorting and make sure relevancy stays the same sort = Sort() self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort(SortField.FIELD_DOC) self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort("int") self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort("float") self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort("string") self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort(["int", "float"]) self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort([SortField("int", True), SortField(None, SortField.DOC, True)]) self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) sort.setSort(["float", "string"]) self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi)) self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full)) self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi)) def testTopDocsScores(self): """ There was previously a bug in FieldSortedHitQueue.maxscore when only a single doc was added. That is what the following tests for. """ sort = Sort() nDocs = 10 # try to pick a query that will result in an unnormalized # score greater than 1 to test for correct normalization docs1 = self.full.search(self.queryE, None, nDocs, sort) # a filter that only allows through the first hit class filter(PythonFilter): def bits(_self, reader): bs = BitSet(reader.maxDoc()) bs.set(0, reader.maxDoc()) bs.set(docs1.scoreDocs[0].doc) return bs filt = filter() docs2 = self.full.search(self.queryE, filt, nDocs, sort) self.assertEqual(docs1.scoreDocs[0].score, docs2.scoreDocs[0].score, 1e-6) def testSortWithoutFillFields(self): """ There was previously a bug in TopFieldCollector when fillFields was set to False - the same doc and score was set in ScoreDoc[] array. This test asserts that if fillFields is False, the documents are set properly. It does not use Searcher's default search methods(with Sort) since all set fillFields to True. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] for sort in sorts: q = MatchAllDocsQuery() tdc = TopFieldCollector.create(sort, 10, False, False, False, True) self.full.search(q, tdc) sds = tdc.topDocs().scoreDocs for i in xrange(1, len(sds)): self.assert_(sds[i].doc != sds[i - 1].doc) def testSortWithoutScoreTracking(self): """ Two Sort criteria to instantiate the multi/single comparators. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] for sort in sorts: q = MatchAllDocsQuery() tdc = TopFieldCollector.create(sort, 10, True, False, False, True) self.full.search(q, tdc) tds = tdc.topDocs() sds = tds.scoreDocs for sd in sds: self.assert_(Float.isNaN_(sd.score)) self.assert_(Float.isNaN_(tds.getMaxScore())) def testSortWithScoreNoMaxScoreTracking(self): """ Two Sort criteria to instantiate the multi/single comparators. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] for sort in sorts: q = MatchAllDocsQuery() tdc = TopFieldCollector.create(sort, 10, True, True, False, True) self.full.search(q, tdc) tds = tdc.topDocs() sds = tds.scoreDocs for sd in sds: self.assert_(not Float.isNaN_(sd.score)) self.assert_(Float.isNaN_(tds.getMaxScore())) def testSortWithScoreAndMaxScoreTracking(self): """ Two Sort criteria to instantiate the multi/single comparators. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] for sort in sorts: q = MatchAllDocsQuery() tdc = TopFieldCollector.create(sort, 10, True, True, True, True) self.full.search(q, tdc) tds = tdc.topDocs() sds = tds.scoreDocs for sd in sds: self.assert_(not Float.isNaN_(sd.score)) self.assert_(not Float.isNaN_(tds.getMaxScore())) def testOutOfOrderDocsScoringSort(self): """ Two Sort criteria to instantiate the multi/single comparators. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] tfcOptions = [[False, False, False], [False, False, True], [False, True, False], [False, True, True], [True, False, False], [True, False, True], [True, True, False], [True, True, True]] actualTFCClasses = [ "OutOfOrderOneComparatorNonScoringCollector", "OutOfOrderOneComparatorScoringMaxScoreCollector", "OutOfOrderOneComparatorScoringNoMaxScoreCollector", "OutOfOrderOneComparatorScoringMaxScoreCollector", "OutOfOrderOneComparatorNonScoringCollector", "OutOfOrderOneComparatorScoringMaxScoreCollector", "OutOfOrderOneComparatorScoringNoMaxScoreCollector", "OutOfOrderOneComparatorScoringMaxScoreCollector" ] # Save the original value to set later. origVal = BooleanQuery.getAllowDocsOutOfOrder() BooleanQuery.setAllowDocsOutOfOrder(True) bq = BooleanQuery() # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2 # which delegates to BS if there are no mandatory clauses. bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD) # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to # return the clause instead of BQ. bq.setMinimumNumberShouldMatch(1) try: for sort in sorts: for tfcOption, actualTFCClass in izip(tfcOptions, actualTFCClasses): tdc = TopFieldCollector.create(sort, 10, tfcOption[0], tfcOption[1], tfcOption[2], False) self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass)) self.full.search(bq, tdc) tds = tdc.topDocs() sds = tds.scoreDocs self.assertEqual(10, len(sds)) finally: # Whatever happens, reset BooleanQuery.allowDocsOutOfOrder to the # original value. Don't set it to False in case the # implementation in BQ will change some day. BooleanQuery.setAllowDocsOutOfOrder(origVal) def testSortWithScoreAndMaxScoreTrackingNoResults(self): """ Two Sort criteria to instantiate the multi/single comparators. """ sorts = [Sort(SortField.FIELD_DOC), Sort()] for sort in sorts: tdc = TopFieldCollector.create(sort, 10, True, True, True, True) tds = tdc.topDocs() self.assertEqual(0, tds.totalHits) self.assert_(Float.isNaN_(tds.getMaxScore())) def runMultiSorts(self, multi, isFull): """ runs a variety of sorts useful for multisearchers """ sort = Sort() sort.setSort(SortField.FIELD_DOC) expected = isFull and "ABCDEFGHIJ" or "ACEGIBDFHJ" self._assertMatches(multi, self.queryA, sort, expected) sort.setSort(SortField("int", SortField.INT)) expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC" self._assertMatches(multi, self.queryA, sort, expected) sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC]) expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC" self._assertMatches(multi, self.queryA, sort, expected) sort.setSort("int") expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC" self._assertMatches(multi, self.queryA, sort, expected) sort.setSort([SortField("float", SortField.FLOAT), SortField.FIELD_DOC]) self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB") sort.setSort("float") self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB") sort.setSort("string") self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC") sort.setSort("int", True) expected = isFull and "CABEJGFHDI" or "CAEBJGFHDI" self._assertMatches(multi, self.queryA, sort, expected) sort.setSort("float", True) self._assertMatches(multi, self.queryA, sort, "BAFECIJHDG") sort.setSort("string", True) self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD") sort.setSort(["int", "float"]) self._assertMatches(multi, self.queryA, sort, "IDHFGJEABC") sort.setSort(["float", "string"]) self._assertMatches(multi, self.queryA, sort, "GDHJICEFAB") sort.setSort("int") self._assertMatches(multi, self.queryF, sort, "IZJ") sort.setSort("int", True) self._assertMatches(multi, self.queryF, sort, "JZI") sort.setSort("float") self._assertMatches(multi, self.queryF, sort, "ZJI") sort.setSort("string") self._assertMatches(multi, self.queryF, sort, "ZJI") sort.setSort("string", True) self._assertMatches(multi, self.queryF, sort, "IJZ") # up to this point, all of the searches should have "sane" # FieldCache behavior, and should have reused hte cache in several # cases self._assertSaneFieldCaches(self.getName() + " various") # next we'll check Locale based(String[]) for 'string', so purge first FieldCache.DEFAULT.purgeAllCaches() sort.setSort([SortField("string", Locale.US)]) self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC") sort.setSort([SortField("string", Locale.US, True)]) self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD") sort.setSort([SortField("string", Locale.UK)]) self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC") self._assertSaneFieldCaches(self.getName() + " Locale.US + Locale.UK") FieldCache.DEFAULT.purgeAllCaches() def _assertMatches(self, searcher, query, sort, expectedResult): """ make sure the documents returned by the search match the expected list """ # ScoreDoc[] result = searcher.search(query, None, 1000, sort).scoreDocs hits = searcher.search(query, None, len(expectedResult), sort) sds = hits.scoreDocs self.assertEqual(hits.totalHits, len(expectedResult)) buff = [] for sd in sds: doc = searcher.doc(sd.doc) v = doc.getValues("tracer") for _v in v: buff.append(_v) self.assertEqual(expectedResult, ''.join(buff)) def getScores(self, hits, searcher): scoreMap = {} for hit in hits: doc = searcher.doc(hit.doc) v = doc.getValues("tracer") self.assertEqual(len(v), 1) scoreMap[v[0]] = hit.score return scoreMap def _assertSameValues(self, m1, m2): """ make sure all the values in the maps match """ self.assertEquals(len(m1), len(m2)) for key in m1.iterkeys(): self.assertEquals(m1[key], m2[key], 1e-6) def getName(self): return type(self).__name__ def _assertSaneFieldCaches(self, msg): entries = FieldCache.DEFAULT.getCacheEntries() insanity = FieldCacheSanityChecker.checkSanity(entries) self.assertEqual(0, len(insanity), msg + ": Insane FieldCache usage(s) found") class MyFieldComparator(PythonFieldComparator): def __init__(self, numHits): super(MyFieldComparator, self).__init__() self.slotValues = [0] * numHits def copy(self, slot, doc): self.slotValues[slot] = self.docValues[doc] def compare(self, slot1, slot2): return self.slotValues[slot1] - self.slotValues[slot2] def compareBottom(self, doc): return self.bottomValue - self.docValues[doc] def setBottom(self, bottom): self.bottomValue = self.slotValues[bottom] def setNextReader(self, reader, docBase): class intParser(PythonIntParser): def parseInt(_self, val): return (ord(val[0]) - ord('A')) * 123456 self.docValues = FieldCache.DEFAULT.getInts(reader, "parser", intParser()) def value(self, slot): return Integer(self.slotValues[slot]) class MyFieldComparatorSource(PythonFieldComparatorSource): def newComparator(self, fieldname, numHits, sortPos, reversed): return MyFieldComparator(numHits) class SampleComparable(PythonComparable): def __init__(self, s): super(SampleComparable, self).__init__() self.string_part, self.int_part = s.split('-', 1) self.int_part = int(self.int_part) def compareTo(self, o): # The not-so-obvious code below first checks that o needs to be # downcast into a SampleComparable, does it if needed, and sets it # to the inner python instance to work, getting it through '.self'. # If the compareTo() call came from Java then o may just be # wrapped with java.lang.Object, hence not an instance of # SampleComparable even though the actual Java object is. # In that case isinstance() returns False but cast_() succeeds. if not isinstance(o, SampleComparable): o = SampleComparable.cast_(o).self i = cmp(self.string_part, o.string_part) if i == 0: return cmp(self.int_part, o.int_part) return i def getComparatorSource(cls): class sortComparatorSource(PythonSortComparatorSource): def newComparator(_self, reader, fieldName): enumerator = reader.terms(Term(fieldName, "")) try: class scoreDocComparator(PythonScoreDocComparator): def __init__(self_): super(scoreDocComparator, self_).__init__() self_.cachedValues = _self.fillCache(reader, enumerator, fieldName) def compare(self_, i, j): return self_.cachedValues[i.doc].compareTo(self_.cachedValues[j.doc]) def sortValue(self_, i): return self_.cachedValues[i.doc] def sortType(self_): return SortField.CUSTOM return scoreDocComparator() finally: enumerator.close() def fillCache(_self, reader, enumerator, fieldName): """ Returns an array of objects which represent that natural order of the term values in the given field. param reader Terms are in this index. param enumerator Use this to get the term values and TermDocs. param fieldName Comparables should be for this field. return Array of objects representing natural order of terms in field. throws IOException If an error occurs reading the index. """ retArray = [None] * reader.maxDoc() if len(retArray) > 0: termDocs = reader.termDocs() try: if enumerator.term() is None: raise AssertionError, "no terms in field " + fieldName while True: term = enumerator.term() if term.field() != fieldName: break termval = _self.getComparable(term.text()) termDocs.seek(enumerator) while termDocs.next(): retArray[termDocs.doc()] = termval if not enumerator.next(): break finally: termDocs.close() return retArray def getComparable(_self, termtext): return SampleComparable(termtext) return sortComparatorSource() def getComparator(cls): class sortComparator(PythonSortComparator): def getComparable(_self, termtext): return SampleComparable(termtext) def hashCode(_self): return _self.getClass().getName().hashCode() def equals(_self, that): return _self.getClass().equals(that.getClass()) return sortComparator() getComparatorSource = classmethod(getComparatorSource) getComparator = classmethod(getComparator) if __name__ == "__main__": import sys, lucene env = lucene.initVM(lucene.CLASSPATH) if '-loop' in sys.argv: sys.argv.remove('-loop') while True: try: main() except: pass # refs = sorted(env._dumpRefs(classes=True).items(), # key=lambda x: x[1], reverse=True) # print refs[0:4] else: main()