You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by va...@apache.org on 2009/09/29 23:55:59 UTC
svn commit: r820102 [2/2] - in /lucene/pylucene/trunk: ./
java/org/apache/pylucene/search/ java/org/apache/pylucene/search/highlight/
jcc/ jcc/_jcc/java/lang/ jcc/jcc/ jcc/jcc/sources/
samples/LuceneInAction/lia/searching/ test/
Modified: lucene/pylucene/trunk/test/test_Sort.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_Sort.py?rev=820102&r1=820101&r2=820102&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_Sort.py (original)
+++ lucene/pylucene/trunk/test/test_Sort.py Tue Sep 29 21:55:57 2009
@@ -12,286 +12,156 @@
# limitations under the License.
# ====================================================================
-import re
+import math
+from itertools import izip
+from random import randint
from unittest import TestCase, main
from lucene import *
+NUM_STRINGS = 6000
+
+
class SortTestCase(TestCase):
"""
Unit tests for sorting code, ported from Java Lucene
"""
- # document data:
- # the tracer field is used to determine which document was hit
- # the contents field is used to search and sort by relevance
- # the int field to sort by int
- # the float field to sort by float
- # the string field to sort by string
-
- data = [
- # tracer contents int float string custom
- [ "A", "x a", "5", "4f", "c", "A-3" ],
- [ "B", "y a", "5", "3.4028235E38", "i", "B-10" ],
- [ "C", "x a b c", "2147483647", "1.0", "j", "A-2" ],
- [ "D", "y a b c", "-1", "0.0f", "a", "C-0" ],
- [ "E", "x a b c d", "5", "2f", "h", "B-8" ],
- [ "F", "y a b c d", "2", "3.14159f", "g", "B-1" ],
- [ "G", "x a b c d", "3", "-1.0", "f", "C-100" ],
- [ "H", "y a b c d", "0", "1.4E-45", "e", "C-88" ],
- [ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10" ],
- [ "J", "y a b c d e f", "4", ".5", "b", "C-7" ],
- [ "W", "g", "1", None, None, None ],
- [ "X", "g", "1", "0.1", None, None ],
- [ "Y", "g", "1", "0.2", None, None ],
- [ "Z", "f g", None, None, None, None ]
- ]
+ def __init__(self, *args, **kwds):
+ super(SortTestCase, self).__init__(*args, **kwds)
+
+ self.data = [
+ # tracer contents int float string custom i18n long double short byte 'custom parser encoding'
+ [ "A", "x a", "5", "4f", "c", "A-3", u"p\u00EAche", "10", "-4.0", "3", "126", "J" ],
+ [ "B", "y a", "5", "3.4028235E38", "i", "B-10", "HAT", "1000000000", "40.0", "24", "1", "I" ],
+ [ "C", "x a b c", "2147483647", "1.0", "j", "A-2", u"p\u00E9ch\u00E9", "99999999", "40.00002343", "125", "15", "H" ],
+ [ "D", "y a b c", "-1", "0.0f", "a", "C-0", "HUT", str(Long.MAX_VALUE), str(Double.MIN_VALUE), str(Short.MIN_VALUE), str(Byte.MIN_VALUE), "G" ],
+ [ "E", "x a b c d", "5", "2f", "h", "B-8", "peach", str(Long.MIN_VALUE), str(Double.MAX_VALUE), str(Short.MAX_VALUE), str(Byte.MAX_VALUE), "F" ],
+ [ "F", "y a b c d", "2", "3.14159f", "g", "B-1", u"H\u00C5T", "-44", "343.034435444", "-3", "0", "E" ],
+ [ "G", "x a b c d", "3", "-1.0", "f", "C-100", "sin", "323254543543", "4.043544", "5", "100", "D" ],
+ [ "H", "y a b c d", "0", "1.4E-45", "e", "C-88", u"H\u00D8T", "1023423423005", "4.043545", "10", "-50", "C" ],
+ [ "I", "x a b c d e f", "-2147483648", "1.0e+0", "d", "A-10", u"s\u00EDn", "332422459999", "4.043546", "-340", "51", "B" ],
+ [ "J", "y a b c d e f", "4", ".5", "b", "C-7", "HOT", "34334543543", "4.0000220343", "300", "2", "A" ],
+ [ "W", "g", "1", None, None, None, None, None, None, None, None, None ],
+ [ "X", "g", "1", "0.1", None, None, None, None, None, None, None, None ],
+ [ "Y", "g", "1", "0.2", None, None, None, None, None, None, None, None ],
+ [ "Z", "f g", None, None, None, None, None, None, None, None, None, None ],
+ ]
def _getIndex(self, even, odd):
- """
- Create an index of all the documents, or just the x,
- or just the y documents
- """
-
+
indexStore = RAMDirectory()
- writer = IndexWriter(indexStore, SimpleAnalyzer(), True)
+ writer = IndexWriter(indexStore, SimpleAnalyzer(), True,
+ IndexWriter.MaxFieldLength.LIMITED)
+ writer.setMaxBufferedDocs(2)
+ writer.setMergeFactor(1000)
- for i in xrange(0, len(self.data)):
- if i % 2 == 0 and even or i % 2 == 1 and odd:
+ for i in xrange(len(self.data)):
+ if (i % 2 == 0 and even) or (i % 2 == 1 and odd):
doc = Document()
- doc.add(Field("tracer", self.data[i][0],
- Field.Store.YES, Field.Index.NO))
- doc.add(Field("contents", self.data[i][1],
- Field.Store.NO, Field.Index.TOKENIZED))
+ doc.add(Field("tracer", self.data[i][0], Field.Store.YES,
+ Field.Index.NO))
+ doc.add(Field("contents", self.data[i][1], Field.Store.NO,
+ Field.Index.ANALYZED))
if self.data[i][2] is not None:
- doc.add(Field("int", self.data[i][2],
- Field.Store.NO, Field.Index.UN_TOKENIZED))
+ doc.add(Field("int", self.data[i][2], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
if self.data[i][3] is not None:
- doc.add(Field("float", self.data[i][3],
- Field.Store.NO, Field.Index.UN_TOKENIZED))
+ doc.add(Field("float", self.data[i][3], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
if self.data[i][4] is not None:
- doc.add(Field("string", self.data[i][4],
- Field.Store.NO, Field.Index.UN_TOKENIZED))
+ doc.add(Field("string", self.data[i][4], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
if self.data[i][5] is not None:
- doc.add(Field("custom", self.data[i][5],
- Field.Store.NO, Field.Index.UN_TOKENIZED))
+ doc.add(Field("custom", self.data[i][5], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][6] is not None:
+ doc.add(Field("i18n", self.data[i][6], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][7] is not None:
+ doc.add(Field("long", self.data[i][7], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][8] is not None:
+ doc.add(Field("double", self.data[i][8], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][9] is not None:
+ doc.add(Field("short", self.data[i][9], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][10] is not None:
+ doc.add(Field("byte", self.data[i][10], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ if self.data[i][11] is not None:
+ doc.add(Field("parser", self.data[i][11], Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ doc.setBoost(2.0) # produce some scores above 1.0
writer.addDocument(doc)
-
- writer.optimize()
+ # writer.optimize()
writer.close()
+ s = IndexSearcher(indexStore)
+ s.setDefaultFieldSortScoring(True, True)
- return IndexSearcher(indexStore)
+ return s
def _getFullIndex(self):
return self._getIndex(True, True)
- def _getXIndex(self):
- return self._getIndex(True, False)
-
- def _getYIndex(self):
- return self._getIndex(False, True)
-
- def _getEmptyIndex(self):
- return self._getIndex(False, False)
+ def getFullStrings(self):
- def _assertMatches(self, searcher, query, sort, expectedResult):
- """
- Make sure the documents returned by the search match the expected list
- """
+ indexStore = RAMDirectory()
+ writer = IndexWriter(indexStore, SimpleAnalyzer(), True,
+ IndexWriter.MaxFieldLength.LIMITED)
+ writer.setMaxBufferedDocs(4)
+ writer.setMergeFactor(97)
- buff = ''.join([''.join(Hit.cast_(hit).getDocument().getValues("tracer"))
- for hit in searcher.search(query, sort)])
-
- self.assertEqual(expectedResult, buff)
-
- def _assertMatchesPattern(self, searcher, query, sort, pattern):
- """
- make sure the documents returned by the search match the expected
- list pattern
- """
-
- buff = ''.join([''.join(Hit.cast_(hit).getDocument().getValues("tracer"))
- for hit in searcher.search(query, sort)])
-
- self.assert_(re.compile(pattern).match(buff))
-
- def _getComparatorSource(self):
- return self._getComparator()
-
- def _getComparable(self, termtext):
-
- class comparable(PythonComparable):
- def __init__(self, termText):
- super(comparable, self).__init__()
- self.string_part, self.int_part = termText.split('-')
- self.int_part = int(self.int_part)
- def compareTo(self, o):
- return (cmp(self.string_part, o.string_part) or
- cmp(self.int_part, o.int_part))
-
- return comparable(termtext)
-
- def _getComparator(self):
-
- class comparator(PythonSortComparator):
-
- def getComparable(_self, termText):
- return self._getComparable(termText)
-
- def newComparator(_self, reader, fieldname):
- enumerator = reader.terms(Term(fieldname, ""))
-
- class comparator(PythonScoreDocComparator):
- def __init__(_self, cache):
- super(comparator, _self).__init__()
- _self.cache = cache
- def compare(_self, i, j):
- return _self.cache[i.doc].compareTo(_self.cache[j.doc])
- def sortType(_self):
- return SortField.CUSTOM
- def sortValue(_self, i):
- return _self.cache[i.doc]
-
- try:
- cache = self._fillCache(reader, enumerator, fieldname)
- return comparator(cache)
- finally:
- enumerator.close()
-
- return comparator()
-
- def _fillCache(self, reader, enumerator, fieldName):
- """
- Returns an array of objects which represent that natural order
- of the term values in the given field.
-
- @param reader Terms are in this index.
- @param enumerator Use this to get the term values and TermDocs.
- @param fieldname Comparables should be for this field.
- @return Array of objects representing natural order of terms in field.
- """
-
- retArray = [None] * reader.maxDoc()
- if len(retArray) > 0:
- termDocs = reader.termDocs()
- try:
- if enumerator.term() is None:
- raise RuntimeError, "no terms in field " + fieldName
- while True:
- term = enumerator.term()
- if term.field() != fieldName:
- break
- termval = self._getComparable(term.text())
- termDocs.seek(enumerator)
- while termDocs.next():
- retArray[termDocs.doc()] = termval
- if not enumerator.next():
- break
- finally:
- termDocs.close()
-
- return retArray
-
- def _runMultiSorts(self, multi):
- """
- runs a variety of sorts useful for multisearchers
- """
-
- sort = Sort()
+ for i in xrange(NUM_STRINGS):
+ doc = Document()
+ num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52)
+ doc.add(Field("tracer", num, Field.Store.YES, Field.Index.NO))
+ # doc.add(Field("contents", str(i), Field.Store.NO,
+ # Field.Index.ANALYZED))
+ doc.add(Field("string", num, Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50)
+ doc.add(Field("string2", num2, Field.Store.NO,
+ Field.Index.NOT_ANALYZED))
+ doc.add(Field("tracer2", num2, Field.Store.YES, Field.Index.NO))
+ doc.setBoost(2.0) # produce some scores above 1.0
+ writer.setMaxBufferedDocs(self.getRandomNumber(2, 12))
+ writer.addDocument(doc)
+
+ # writer.optimize()
+ # print writer.getSegmentCount()
+ writer.close()
- sort.setSort(SortField.FIELD_DOC)
- self._assertMatchesPattern(multi, self.queryA, sort,
- "[AB]{2}[CD]{2}[EF]{2}[GH]{2}[IJ]{2}")
+ return IndexSearcher(indexStore)
+
+ def getRandomNumberString(self, num, low, high):
- sort.setSort(SortField("int", SortField.INT))
- self._assertMatchesPattern(multi, self.queryA, sort,
- "IDHFGJ[ABE]{3}C")
+ return ''.join([self.getRandomNumber(low, high) for i in xrange(num)])
+
+ def getRandomCharString(self, num):
+
+ return self.getRandomCharString(num, 48, 122)
+
+ def getRandomCharString(self, num, start, end):
- sort.setSort([SortField("int", SortField.INT),
- SortField.FIELD_DOC])
- self._assertMatchesPattern(multi, self.queryA, sort,
- "IDHFGJ[AB]{2}EC")
-
- sort.setSort("int")
- self._assertMatchesPattern(multi, self.queryA, sort,
- "IDHFGJ[AB]{2}EC")
-
- sort.setSort([SortField("float", SortField.FLOAT),
- SortField.FIELD_DOC])
- self._assertMatchesPattern(multi, self.queryA, sort,
- "GDHJ[CI]{2}EFAB")
-
- sort.setSort("float")
- self._assertMatchesPattern(multi, self.queryA, sort,
- "GDHJ[CI]{2}EFAB")
-
- sort.setSort("string")
- self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
-
- sort.setSort("int", True)
- self._assertMatchesPattern(multi, self.queryA, sort,
- "C[AB]{2}EJGFHDI")
-
- sort.setSort("float", True)
- self._assertMatchesPattern(multi, self.queryA, sort,
- "BAFE[IC]{2}JHDG")
-
- sort.setSort("string", True)
- self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
-
- sort.setSort([SortField("string", Locale.US)])
- self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
-
- sort.setSort([SortField("string", Locale.US, True)])
- self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
-
- sort.setSort(["int", "float"])
- self._assertMatches(multi, self.queryA, sort, "IDHFGJEABC")
-
- sort.setSort(["float", "string"])
- self._assertMatches(multi, self.queryA, sort, "GDHJICEFAB")
-
- sort.setSort("int")
- self._assertMatches(multi, self.queryF, sort, "IZJ")
-
- sort.setSort("int", True)
- self._assertMatches(multi, self.queryF, sort, "JZI")
-
- sort.setSort("float")
- self._assertMatches(multi, self.queryF, sort, "ZJI")
+ return ''.join([chr(self.getRandomNumber(start, end))
+ for i in xrange(num)])
+
+ def getRandomNumber(self, low, high):
+
+ return randint(low, high)
- sort.setSort("string")
- self._assertMatches(multi, self.queryF, sort, "ZJI")
-
- sort.setSort("string", True)
- self._assertMatches(multi, self.queryF, sort, "IJZ")
-
- def _getScores(self, hits):
-
- scoreMap = {}
-
- for hit in hits:
- hit = Hit.cast_(hit)
- doc = hit.getDocument()
- v = doc.getValues("tracer")
- self.assertEqual(len(v), 1)
- scoreMap[v[0]] = float(hit.getScore())
-
- return scoreMap
-
- def _assertSameValues(self, m1, m2):
- """
- make sure all the values in the maps match
- """
+ def _getXIndex(self):
+ return self._getIndex(True, False)
- n = len(m1)
- m = len(m2)
- self.assertEqual(n, m)
+ def _getYIndex(self):
+ return self._getIndex(False, True)
- for key in m1.iterkeys():
- self.assertEqual(m1[key], m2[key])
+ def _getEmptyIndex(self):
+ return self._getIndex(False, False)
def setUp(self):
@@ -301,17 +171,9 @@
self.queryX = TermQuery(Term("contents", "x"))
self.queryY = TermQuery(Term("contents", "y"))
self.queryA = TermQuery(Term("contents", "a"))
+ self.queryE = TermQuery(Term("contents", "e"))
self.queryF = TermQuery(Term("contents", "f"))
-
- def tearDown(self):
-
- del self.full
- del self.queryX
- del self.queryY
- del self.queryA
- del self.queryF
- del self.searchX
- del self.searchY
+ self.queryG = TermQuery(Term("contents", "g"))
def testBuiltInSorts(self):
"""
@@ -332,6 +194,7 @@
"""
sort = Sort()
+
sort.setSort([SortField("int", SortField.INT),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryX, sort, "IGAEC")
@@ -342,11 +205,156 @@
self._assertMatches(self.full, self.queryX, sort, "GCIEA")
self._assertMatches(self.full, self.queryY, sort, "DHJFB")
+ sort.setSort([SortField("long", SortField.LONG),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryX, sort, "EACGI")
+ self._assertMatches(self.full, self.queryY, sort, "FBJHD")
+
+ sort.setSort([SortField("double", SortField.DOUBLE),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryX, sort, "AGICE")
+ self._assertMatches(self.full, self.queryY, sort, "DJHBF")
+
+ sort.setSort([SortField("byte", SortField.BYTE),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryX, sort, "CIGAE")
+ self._assertMatches(self.full, self.queryY, sort, "DHFBJ")
+
+ sort.setSort([SortField("short", SortField.SHORT),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryX, sort, "IAGCE")
+ self._assertMatches(self.full, self.queryY, sort, "DFHBJ")
+
sort.setSort([SortField("string", SortField.STRING),
SortField.FIELD_DOC])
self._assertMatches(self.full, self.queryX, sort, "AIGEC")
self._assertMatches(self.full, self.queryY, sort, "DJHFB")
+
+ def testStringSort(self):
+ """
+ Test String sorting: small queue to many matches, multi field sort,
+ reverse sort
+ """
+ sort = Sort()
+ searcher = self.getFullStrings()
+
+ sort.setSort([SortField("string", SortField.STRING),
+ SortField("string2", SortField.STRING, True),
+ SortField.FIELD_DOC])
+
+ result = searcher.search(MatchAllDocsQuery(), None, 500, sort).scoreDocs
+
+ buff = []
+ last = None
+ lastSub = None
+ lastDocId = 0
+ fail = False
+
+ for scoreDoc in result:
+ doc2 = searcher.doc(scoreDoc.doc)
+ v = doc2.getValues("tracer")
+ v2 = doc2.getValues("tracer2")
+ for _v, _v2 in izip(v, v2):
+ if last is not None:
+ _cmp = cmp(_v, last)
+ if _cmp < 0: # ensure first field is in order
+ fail = True
+ print "fail:", _v, "<", last
+
+ if _cmp == 0: # ensure second field is in reverse order
+ _cmp = cmp(_v2, lastSub)
+ if _cmp > 0:
+ fail = True
+ print "rev field fail:", _v2, ">", lastSub
+ elif _cmp == 0: # ensure docid is in order
+ if scoreDoc.doc < lastDocId:
+ fail = True
+ print "doc fail:", scoreDoc.doc, ">", lastDocId
+
+ last = _v
+ lastSub = _v2
+ lastDocId = scoreDoc.doc
+ buff.append(_v + "(" + _v2 + ")(" + str(scoreDoc.doc) + ") ")
+
+ if fail:
+ print "topn field1(field2)(docID):", ''.join(buff)
+
+ self.assert_(not fail, "Found sort results out of order")
+
+ def testCustomFieldParserSort(self):
+ """
+ test sorts where the type of field is specified and a custom field
+ parser is used that relies on a simple char encoding. The sorted string
+ contains a single character, starting from 'A', that is mapped to a
+ numeric value using some "funny" algorithm that differs for each data
+ type.
+ """
+
+ # since tests explicitly use different parsers on the same field name
+ # we explicitly check/purge the FieldCache between each assertMatch
+ fc = FieldCache.DEFAULT
+
+ class intParser(PythonIntParser):
+ def parseInt(_self, val):
+ return (ord(val[0]) - ord('A')) * 123456
+
+ class floatParser(PythonFloatParser):
+ def parseFloat(_self, val):
+ return math.sqrt(ord(val[0]))
+
+ class longParser(PythonLongParser):
+ def parseLong(_self, val):
+ return (ord(val[0]) - ord('A')) * 1234567890L
+
+ class doubleParser(PythonDoubleParser):
+ def parseDouble(_self, val):
+ return math.pow(ord(val[0]), ord(val[0]) - ord('A'))
+
+ class byteParser(PythonByteParser):
+ def parseByte(_self, val):
+ return chr(ord(val[0]) - ord('A'))
+
+ class shortParser(PythonShortParser):
+ def parseShort(_self, val):
+ return ord(val[0]) - ord('A')
+
+ sort = Sort()
+ sort.setSort([SortField("parser", intParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " IntParser")
+ fc.purgeAllCaches()
+
+ sort.setSort([SortField("parser", floatParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " FloatParser")
+ fc.purgeAllCaches()
+
+ sort.setSort([SortField("parser", longParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " LongParser")
+ fc.purgeAllCaches()
+
+ sort.setSort([SortField("parser", doubleParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " DoubleParser")
+ fc.purgeAllCaches()
+
+ sort.setSort([SortField("parser", byteParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " ByteParser")
+ fc.purgeAllCaches()
+
+ sort.setSort([SortField("parser", shortParser()),
+ SortField.FIELD_DOC])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+ self._assertSaneFieldCaches(self.getName() + " ShortParser")
+ fc.purgeAllCaches()
def testEmptyIndex(self):
"""
@@ -361,8 +369,7 @@
sort.setSort(SortField.FIELD_DOC)
self._assertMatches(empty, self.queryX, sort, "")
- sort.setSort([SortField("int", SortField.INT),
- SortField.FIELD_DOC])
+ sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC])
self._assertMatches(empty, self.queryX, sort, "")
sort.setSort([SortField("string", SortField.STRING, True),
@@ -374,11 +381,19 @@
self._assertMatches(empty, self.queryX, sort, "")
+ def testNewCustomFieldParserSort(self):
+ """
+ Test sorting w/ custom FieldComparator
+ """
+ sort = Sort()
+
+ sort.setSort([SortField("parser", MyFieldComparatorSource())])
+ self._assertMatches(self.full, self.queryA, sort, "JIHGFEDCBA")
+
def testAutoSort(self):
"""
test sorts where the type of field is determined dynamically
"""
-
sort = Sort()
sort.setSort("int")
@@ -397,7 +412,6 @@
"""
test sorts in reverse
"""
-
sort = Sort()
sort.setSort([SortField(None, SortField.SCORE, True),
@@ -423,17 +437,22 @@
def testEmptyFieldSort(self):
"""
- test sorting when the sort field is empty (undefined)
- for some of the documents
+ test sorting when the sort field is empty (undefined) for some of the
+ documents
"""
-
sort = Sort()
-
+
sort.setSort("string")
self._assertMatches(self.full, self.queryF, sort, "ZJI")
sort.setSort("string", True)
self._assertMatches(self.full, self.queryF, sort, "IJZ")
+
+ sort.setSort(SortField("i18n", Locale.ENGLISH))
+ self._assertMatches(self.full, self.queryF, sort, "ZJI")
+
+ sort.setSort(SortField("i18n", Locale.ENGLISH, True))
+ self._assertMatches(self.full, self.queryF, sort, "IJZ")
sort.setSort("int")
self._assertMatches(self.full, self.queryF, sort, "IZJ")
@@ -444,16 +463,66 @@
sort.setSort("float")
self._assertMatches(self.full, self.queryF, sort, "ZJI")
+ # using a nonexisting field as first sort key shouldn't make a
+ # difference:
+ sort.setSort([SortField("nosuchfield", SortField.STRING),
+ SortField("float")])
+ self._assertMatches(self.full, self.queryF, sort, "ZJI")
+
sort.setSort("float", True)
self._assertMatches(self.full, self.queryF, sort, "IJZ")
-
+
+ # When a field is None for both documents, the next SortField should
+ # be used.
+ # Checked on self.full first, then on the multi-searchers below.
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float")])
+ self._assertMatches(self.full, self.queryG, sort, "ZWXY")
+
+ # Reverse the last criterion to make sure the test didn't pass by
+ # chance
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float", True)])
+ self._assertMatches(self.full, self.queryG, sort, "ZYXW")
+
+ # Do the same for a MultiSearcher
+ multiSearcher = MultiSearcher([self.full])
+
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float")])
+ self._assertMatches(multiSearcher, self.queryG, sort, "ZWXY")
+
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float", True)])
+ self._assertMatches(multiSearcher, self.queryG, sort, "ZYXW")
+
+ # Don't close the multiSearcher: it would close the full searcher too!
+ # Do the same for a ParallelMultiSearcher
+ parallelSearcher = ParallelMultiSearcher([self.full])
+
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float")])
+ self._assertMatches(parallelSearcher, self.queryG, sort, "ZWXY")
+
+ sort.setSort([SortField("int"),
+ SortField("string", SortField.STRING),
+ SortField("float", True)])
+ self._assertMatches(parallelSearcher, self.queryG, sort, "ZYXW")
+
+ # Don't close the parallelSearcher: it would close the full searcher
+ # too!
+
def testSortCombos(self):
"""
test sorts using a series of fields
"""
-
sort = Sort()
-
+
sort.setSort(["int", "float"])
self._assertMatches(self.full, self.queryX, sort, "IGEAC")
@@ -468,9 +537,8 @@
"""
test using a Locale for sorting strings
"""
-
sort = Sort()
-
+
sort.setSort([SortField("string", Locale.US)])
self._assertMatches(self.full, self.queryX, sort, "AIGEC")
self._assertMatches(self.full, self.queryY, sort, "DJHFB")
@@ -479,23 +547,62 @@
self._assertMatches(self.full, self.queryX, sort, "CEGIA")
self._assertMatches(self.full, self.queryY, sort, "BFHJD")
+ def testInternationalSort(self):
+ """
+ test using various international locales with accented characters
+ (which sort differently depending on locale)
+ """
+ sort = Sort()
+
+ sort.setSort(SortField("i18n", Locale.US))
+ self._assertMatches(self.full, self.queryY, sort, "BFJDH")
+
+ sort.setSort(SortField("i18n", Locale("sv", "se")))
+ self._assertMatches(self.full, self.queryY, sort, "BJDFH")
+
+ sort.setSort(SortField("i18n", Locale("da", "dk")))
+ self._assertMatches(self.full, self.queryY, sort, "BJDHF")
+
+ sort.setSort(SortField("i18n", Locale.US))
+ self._assertMatches(self.full, self.queryX, sort, "ECAGI")
+
+ sort.setSort(SortField("i18n", Locale.FRANCE))
+ self._assertMatches(self.full, self.queryX, sort, "EACGI")
+
+ def testInternationalMultiSearcherSort(self):
+ """
+ Test the MultiSearcher's ability to preserve locale-sensitive ordering
+ by wrapping it around a single searcher
+ """
+ sort = Sort()
+
+ multiSearcher = MultiSearcher([self.full])
+ sort.setSort(SortField("i18n", Locale("sv", "se")))
+ self._assertMatches(multiSearcher, self.queryY, sort, "BJDFH")
+
+ sort.setSort(SortField("i18n", Locale.US))
+ self._assertMatches(multiSearcher, self.queryY, sort, "BFJDH")
+
+ sort.setSort(SortField("i18n", Locale("da", "dk")))
+ self._assertMatches(multiSearcher, self.queryY, sort, "BJDHF")
+
def testCustomSorts(self):
"""
test a custom sort function
"""
-
sort = Sort()
-
- sort.setSort(SortField("custom", self._getComparatorSource()))
+
+ sort.setSort(SortField("custom",
+ SampleComparable.getComparatorSource()))
self._assertMatches(self.full, self.queryX, sort, "CAIEG")
- sort.setSort(SortField("custom", self._getComparatorSource(), True))
+ sort.setSort(SortField("custom",
+ SampleComparable.getComparatorSource(), True))
self._assertMatches(self.full, self.queryY, sort, "HJDBF")
- custom = self._getComparator()
+ custom = SampleComparable.getComparator()
sort.setSort(SortField("custom", custom))
self._assertMatches(self.full, self.queryX, sort, "CAIEG")
-
sort.setSort(SortField("custom", custom, True))
self._assertMatches(self.full, self.queryY, sort, "HJDBF")
@@ -503,9 +610,9 @@
"""
test a variety of sorts using more than one searcher
"""
-
+
searcher = MultiSearcher([self.searchX, self.searchY])
- self._runMultiSorts(searcher)
+ self.runMultiSorts(searcher, False)
def testParallelMultiSort(self):
"""
@@ -513,7 +620,7 @@
"""
searcher = ParallelMultiSearcher([self.searchX, self.searchY])
- self._runMultiSorts(searcher)
+ self.runMultiSorts(searcher, False)
def testNormalizedScores(self):
"""
@@ -521,94 +628,542 @@
hits are sorted
"""
- full = self.full
-
# capture relevancy scores
- scoresX = self._getScores(full.search(self.queryX))
- scoresY = self._getScores(full.search(self.queryY))
- scoresA = self._getScores(full.search(self.queryA))
-
- # we'll test searching locally and multi
- # note: the multi test depends on each separate index containing
- # the same documents as our local index, so the computed normalization
- # will be the same. so we make a multi searcher over two equal document
- # sets - not realistic, but necessary for testing.
-
- queryX = self.queryX
- queryY = self.queryY
- queryA = self.queryA
+ scoresX = self.getScores(self.full.search(self.queryX, None,
+ 1000).scoreDocs, self.full)
+ scoresY = self.getScores(self.full.search(self.queryY, None,
+ 1000).scoreDocs, self.full)
+ scoresA = self.getScores(self.full.search(self.queryA, None,
+ 1000).scoreDocs, self.full)
+ # we'll test searching locally, remote and multi
multi = MultiSearcher([self.searchX, self.searchY])
- gs = self._getScores
-
# change sorting and make sure relevancy stays the same
sort = Sort()
-
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort(SortField.FIELD_DOC)
- self._assertSameValues(scoresX, gs(full.search(queryX,sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort("int")
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort("float")
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort("string")
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort(["int", "float"])
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
sort.setSort([SortField("int", True),
SortField(None, SortField.DOC, True)])
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
+
+ sort.setSort(["float", "string"])
+ self._assertSameValues(scoresX, self.getScores(self.full.search(self.queryX, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresX, self.getScores(multi.search(self.queryX, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresY, self.getScores(self.full.search(self.queryY, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresY, self.getScores(multi.search(self.queryY, None, 1000, sort).scoreDocs, multi))
+ self._assertSameValues(scoresA, self.getScores(self.full.search(self.queryA, None, 1000, sort).scoreDocs, self.full))
+ self._assertSameValues(scoresA, self.getScores(multi.search(self.queryA, None, 1000, sort).scoreDocs, multi))
+
+ def testTopDocsScores(self):
+ """
+ There was previously a bug in FieldSortedHitQueue.maxscore when only
+ a single doc was added. That is what the following tests for.
+ """
+
+ sort = Sort()
+ nDocs = 10
+
+ # try to pick a query that will result in an unnormalized
+ # score greater than 1 to test for correct normalization
+ docs1 = self.full.search(self.queryE, None, nDocs, sort)
+
+ # a filter that only allows through the first hit
+ class filter(PythonFilter):
+ def bits(_self, reader):
+ bs = BitSet(reader.maxDoc())
+ bs.set(0, reader.maxDoc())
+ bs.set(docs1.scoreDocs[0].doc)
+ return bs
+
+ filt = filter()
+
+ docs2 = self.full.search(self.queryE, filt, nDocs, sort)
+ self.assertEqual(docs1.scoreDocs[0].score,
+ docs2.scoreDocs[0].score,
+ 1e-6)
+
+ def testSortWithoutFillFields(self):
+ """
+ There was previously a bug in TopFieldCollector when fillFields was
+ set to False - the same doc and score were set in the ScoreDoc[]
+ array. This test asserts that if fillFields is False, the documents
+ are set properly. It does not use Searcher's default search
+ methods (with Sort) since they all set fillFields to True.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+ for sort in sorts:
+ q = MatchAllDocsQuery()
+ tdc = TopFieldCollector.create(sort, 10, False,
+ False, False, True)
+ self.full.search(q, tdc)
+
+ sds = tdc.topDocs().scoreDocs
+ for i in xrange(1, len(sds)):
+ self.assert_(sds[i].doc != sds[i - 1].doc)
+
+ def testSortWithoutScoreTracking(self):
+ """
+ Two Sort criteria to instantiate the multi/single comparators.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+ for sort in sorts:
+ q = MatchAllDocsQuery()
+ tdc = TopFieldCollector.create(sort, 10, True, False,
+ False, True)
+
+ self.full.search(q, tdc)
+
+ tds = tdc.topDocs()
+ sds = tds.scoreDocs
+ for sd in sds:
+ self.assert_(Float.isNaN_(sd.score))
+
+ self.assert_(Float.isNaN_(tds.getMaxScore()))
+
+ def testSortWithScoreNoMaxScoreTracking(self):
+ """
+ Two Sort criteria to instantiate the multi/single comparators.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+ for sort in sorts:
+ q = MatchAllDocsQuery()
+ tdc = TopFieldCollector.create(sort, 10, True, True,
+ False, True)
+
+ self.full.search(q, tdc)
+
+ tds = tdc.topDocs()
+ sds = tds.scoreDocs
+ for sd in sds:
+ self.assert_(not Float.isNaN_(sd.score))
+
+ self.assert_(Float.isNaN_(tds.getMaxScore()))
+
+ def testSortWithScoreAndMaxScoreTracking(self):
+ """
+ Two Sort criteria to instantiate the multi/single comparators.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+ for sort in sorts:
+ q = MatchAllDocsQuery()
+ tdc = TopFieldCollector.create(sort, 10, True, True,
+ True, True)
+
+ self.full.search(q, tdc)
+
+ tds = tdc.topDocs()
+ sds = tds.scoreDocs
+ for sd in sds:
+ self.assert_(not Float.isNaN_(sd.score))
+
+ self.assert_(not Float.isNaN_(tds.getMaxScore()))
+
+ def testOutOfOrderDocsScoringSort(self):
+ """
+ Two Sort criteria to instantiate the multi/single comparators.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+
+ tfcOptions = [[False, False, False],
+ [False, False, True],
+ [False, True, False],
+ [False, True, True],
+ [True, False, False],
+ [True, False, True],
+ [True, True, False],
+ [True, True, True]]
+
+ actualTFCClasses = [
+ "OutOfOrderOneComparatorNonScoringCollector",
+ "OutOfOrderOneComparatorScoringMaxScoreCollector",
+ "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
+ "OutOfOrderOneComparatorScoringMaxScoreCollector",
+ "OutOfOrderOneComparatorNonScoringCollector",
+ "OutOfOrderOneComparatorScoringMaxScoreCollector",
+ "OutOfOrderOneComparatorScoringNoMaxScoreCollector",
+ "OutOfOrderOneComparatorScoringMaxScoreCollector"
+ ]
+
+ # Save the original value to set later.
+ origVal = BooleanQuery.getAllowDocsOutOfOrder()
+
+ BooleanQuery.setAllowDocsOutOfOrder(True)
+ bq = BooleanQuery()
+
+ # Add a Query with SHOULD, since bw.scorer() returns BooleanScorer2
+ # which delegates to BS if there are no mandatory clauses.
+ bq.add(MatchAllDocsQuery(), BooleanClause.Occur.SHOULD)
+
+ # Set minNrShouldMatch to 1 so that BQ will not optimize rewrite to
+ # return the clause instead of BQ.
+ bq.setMinimumNumberShouldMatch(1)
+
+ try:
+ for sort in sorts:
+ for tfcOption, actualTFCClass in izip(tfcOptions,
+ actualTFCClasses):
+ tdc = TopFieldCollector.create(sort, 10, tfcOption[0],
+ tfcOption[1], tfcOption[2],
+ False)
+
+ self.assert_(tdc.getClass().getName().endswith("$" + actualTFCClass))
+
+ self.full.search(bq, tdc)
+
+ tds = tdc.topDocs()
+ sds = tds.scoreDocs
+ self.assertEqual(10, len(sds))
+ finally:
+ # Whatever happens, reset BooleanQuery.allowDocsOutOfOrder to the
+ # original value. Don't set it to False in case the
+ # implementation in BQ will change some day.
+ BooleanQuery.setAllowDocsOutOfOrder(origVal)
+
+ def testSortWithScoreAndMaxScoreTrackingNoResults(self):
+ """
+ Two Sort criteria to instantiate the multi/single comparators.
+ """
+
+ sorts = [Sort(SortField.FIELD_DOC), Sort()]
+ for sort in sorts:
+ tdc = TopFieldCollector.create(sort, 10, True, True, True, True)
+ tds = tdc.topDocs()
+ self.assertEqual(0, tds.totalHits)
+ self.assert_(Float.isNaN_(tds.getMaxScore()))
+
+ def runMultiSorts(self, multi, isFull):
+ """
+ runs a variety of sorts useful for multisearchers
+ """
+ sort = Sort()
+
+ sort.setSort(SortField.FIELD_DOC)
+ expected = isFull and "ABCDEFGHIJ" or "ACEGIBDFHJ"
+ self._assertMatches(multi, self.queryA, sort, expected)
+
+ sort.setSort(SortField("int", SortField.INT))
+ expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
+ self._assertMatches(multi, self.queryA, sort, expected)
+
+ sort.setSort([SortField("int", SortField.INT), SortField.FIELD_DOC])
+ expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
+ self._assertMatches(multi, self.queryA, sort, expected)
+
+ sort.setSort("int")
+ expected = isFull and "IDHFGJABEC" or "IDHFGJAEBC"
+ self._assertMatches(multi, self.queryA, sort, expected)
+
+ sort.setSort([SortField("float", SortField.FLOAT), SortField.FIELD_DOC])
+ self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB")
+
+ sort.setSort("float")
+ self._assertMatches(multi, self.queryA, sort, "GDHJCIEFAB")
+
+ sort.setSort("string")
+ self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
+
+ sort.setSort("int", True)
+ expected = isFull and "CABEJGFHDI" or "CAEBJGFHDI"
+ self._assertMatches(multi, self.queryA, sort, expected)
+
+ sort.setSort("float", True)
+ self._assertMatches(multi, self.queryA, sort, "BAFECIJHDG")
+
+ sort.setSort("string", True)
+ self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
+
+ sort.setSort(["int", "float"])
+ self._assertMatches(multi, self.queryA, sort, "IDHFGJEABC")
sort.setSort(["float", "string"])
- self._assertSameValues(scoresX, gs(full.search(queryX, sort)))
- self._assertSameValues(scoresX, gs(multi.search(queryX, sort)))
- self._assertSameValues(scoresY, gs(full.search(queryY, sort)))
- self._assertSameValues(scoresY, gs(multi.search(queryY, sort)))
- self._assertSameValues(scoresA, gs(full.search(queryA, sort)))
- self._assertSameValues(scoresA, gs(multi.search(queryA, sort)))
+ self._assertMatches(multi, self.queryA, sort, "GDHJICEFAB")
+
+ sort.setSort("int")
+ self._assertMatches(multi, self.queryF, sort, "IZJ")
+
+ sort.setSort("int", True)
+ self._assertMatches(multi, self.queryF, sort, "JZI")
+
+ sort.setSort("float")
+ self._assertMatches(multi, self.queryF, sort, "ZJI")
+
+ sort.setSort("string")
+ self._assertMatches(multi, self.queryF, sort, "ZJI")
+
+ sort.setSort("string", True)
+ self._assertMatches(multi, self.queryF, sort, "IJZ")
+
+ # up to this point, all of the searches should have "sane"
+ # FieldCache behavior, and should have reused the cache in several
+ # cases
+ self._assertSaneFieldCaches(self.getName() + " various")
+
+ # next we'll check Locale based (String[]) for 'string', so purge first
+ FieldCache.DEFAULT.purgeAllCaches()
+
+ sort.setSort([SortField("string", Locale.US)])
+ self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
+
+ sort.setSort([SortField("string", Locale.US, True)])
+ self._assertMatches(multi, self.queryA, sort, "CBEFGHIAJD")
+
+ sort.setSort([SortField("string", Locale.UK)])
+ self._assertMatches(multi, self.queryA, sort, "DJAIHGFEBC")
+
+ self._assertSaneFieldCaches(self.getName() + " Locale.US + Locale.UK")
+ FieldCache.DEFAULT.purgeAllCaches()
+
+ def _assertMatches(self, searcher, query, sort, expectedResult):
+ """
+ make sure the documents returned by the search match the expected
+ list
+ """
+
+ # ScoreDoc[] result = searcher.search(query, None, 1000, sort).scoreDocs
+ hits = searcher.search(query, None, len(expectedResult), sort)
+ sds = hits.scoreDocs
+
+ self.assertEqual(hits.totalHits, len(expectedResult))
+ buff = []
+ for sd in sds:
+ doc = searcher.doc(sd.doc)
+ v = doc.getValues("tracer")
+ for _v in v:
+ buff.append(_v)
+
+ self.assertEqual(expectedResult, ''.join(buff))
+
+ def getScores(self, hits, searcher):
+
+ scoreMap = {}
+ for hit in hits:
+ doc = searcher.doc(hit.doc)
+ v = doc.getValues("tracer")
+ self.assertEqual(len(v), 1)
+ scoreMap[v[0]] = hit.score
+
+ return scoreMap
+
+ def _assertSameValues(self, m1, m2):
+ """
+ make sure all the values in the maps match
+ """
+
+ self.assertEquals(len(m1), len(m2))
+ for key in m1.iterkeys():
+ self.assertEquals(m1[key], m2[key], 1e-6)
+
+ def getName(self):
+
+ return type(self).__name__
+
+ def _assertSaneFieldCaches(self, msg):
+
+ entries = FieldCache.DEFAULT.getCacheEntries()
+
+ insanity = FieldCacheSanityChecker.checkSanity(entries)
+ self.assertEqual(0, len(insanity),
+ msg + ": Insane FieldCache usage(s) found")
+
+
+class MyFieldComparator(PythonFieldComparator):
+
+ def __init__(self, numHits):
+ super(MyFieldComparator, self).__init__()
+ self.slotValues = [0] * numHits
+
+ def copy(self, slot, doc):
+ self.slotValues[slot] = self.docValues[doc]
+
+ def compare(self, slot1, slot2):
+ return self.slotValues[slot1] - self.slotValues[slot2]
+
+ def compareBottom(self, doc):
+ return self.bottomValue - self.docValues[doc]
+
+ def setBottom(self, bottom):
+ self.bottomValue = self.slotValues[bottom]
+
+ def setNextReader(self, reader, docBase):
+
+ class intParser(PythonIntParser):
+ def parseInt(_self, val):
+ return (ord(val[0]) - ord('A')) * 123456
+
+ self.docValues = FieldCache.DEFAULT.getInts(reader, "parser",
+ intParser())
+
+ def value(self, slot):
+ return Integer(self.slotValues[slot])
+
+
+class MyFieldComparatorSource(PythonFieldComparatorSource):
+
+ def newComparator(self, fieldname, numHits, sortPos, reversed):
+ return MyFieldComparator(numHits)
+
+
+class SampleComparable(PythonComparable):
+
+ def __init__(self, s):
+ super(SampleComparable, self).__init__()
+
+ self.string_part, self.int_part = s.split('-', 1)
+ self.int_part = int(self.int_part)
+
+ def compareTo(self, o):
+
+ # The not-so-obvious code below first checks that o needs to be
+ # downcast into a SampleComparable, does it if needed, and sets it
+ # to the inner python instance to work, getting it through '.self'.
+ # If the compareTo() call came from Java then o may just be
+ # wrapped with java.lang.Object, hence not an instance of
+ # SampleComparable even though the actual Java object is.
+ # In that case isinstance() returns False but cast_() succeeds.
+ if not isinstance(o, SampleComparable):
+ o = SampleComparable.cast_(o).self
+
+ i = cmp(self.string_part, o.string_part)
+ if i == 0:
+ return cmp(self.int_part, o.int_part)
+
+ return i
+
+ def getComparatorSource(cls):
+
+ class sortComparatorSource(PythonSortComparatorSource):
+
+ def newComparator(_self, reader, fieldName):
+ enumerator = reader.terms(Term(fieldName, ""))
+ try:
+ class scoreDocComparator(PythonScoreDocComparator):
+ def __init__(self_):
+ super(scoreDocComparator, self_).__init__()
+ self_.cachedValues = _self.fillCache(reader, enumerator, fieldName)
+
+ def compare(self_, i, j):
+ return self_.cachedValues[i.doc].compareTo(self_.cachedValues[j.doc])
+
+ def sortValue(self_, i):
+ return self_.cachedValues[i.doc]
+
+ def sortType(self_):
+ return SortField.CUSTOM
+
+ return scoreDocComparator()
+ finally:
+ enumerator.close()
+
+ def fillCache(_self, reader, enumerator, fieldName):
+ """
+ Returns an array of objects which represent the natural order
+ of the term values in the given field.
+
+ param reader Terms are in this index.
+ param enumerator Use this to get the term values and TermDocs.
+ param fieldName Comparables should be for this field.
+ return Array of objects representing natural order of terms
+ in field.
+ throws IOException If an error occurs reading the index.
+ """
+ retArray = [None] * reader.maxDoc()
+ if len(retArray) > 0:
+ termDocs = reader.termDocs()
+ try:
+ if enumerator.term() is None:
+ raise AssertionError, "no terms in field " + fieldName
+ while True:
+ term = enumerator.term()
+ if term.field() != fieldName:
+ break
+ termval = _self.getComparable(term.text())
+ termDocs.seek(enumerator)
+ while termDocs.next():
+ retArray[termDocs.doc()] = termval
+ if not enumerator.next():
+ break
+ finally:
+ termDocs.close()
+ return retArray
+
+ def getComparable(_self, termtext):
+ return SampleComparable(termtext)
+
+ return sortComparatorSource()
+
+ def getComparator(cls):
+
+ class sortComparator(PythonSortComparator):
+ def getComparable(_self, termtext):
+ return SampleComparable(termtext)
+
+ def hashCode(_self):
+ return _self.getClass().getName().hashCode()
+
+ def equals(_self, that):
+ return _self.getClass().equals(that.getClass())
+
+ return sortComparator()
+
+ getComparatorSource = classmethod(getComparatorSource)
+ getComparator = classmethod(getComparator)
if __name__ == "__main__":