You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mh...@apache.org on 2015/02/12 11:10:17 UTC
svn commit: r1659195 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/util/ core/src/test/org/apache/lucene/util/
misc/src/java/org/apache/lucene/search/
misc/src/test/org/apache/lucene/search/
Author: mharwood
Date: Thu Feb 12 10:10:16 2015
New Revision: 1659195
URL: http://svn.apache.org/r1659195
Log:
LUCENE-6066: new DiversifiedTopDocsCollector in misc and PriorityQueue.remove method
Added:
lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java (with props)
lucene/dev/trunk/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1659195&r1=1659194&r2=1659195&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Feb 12 10:10:16 2015
@@ -33,6 +33,10 @@ API Changes
New Features
+* LUCENE-6066: Added DiversifiedTopDocsCollector to misc for collecting no more
+ than a given number of results under a choice of key. Introduces new remove
+ method to core's PriorityQueue. (Mark Harwood)
+
* LUCENE-3922: Added JapaneseNumberFilter that normalizes Japanese numbers
in kansuji form to regular/Arabic numbers. (Gaute Lambertsen, Christian Moen)
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java?rev=1659195&r1=1659194&r2=1659195&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/PriorityQueue.java Thu Feb 12 10:10:16 2015
@@ -17,17 +17,19 @@ package org.apache.lucene.util;
* limitations under the License.
*/
-/** A PriorityQueue maintains a partial ordering of its elements such that the
- * least element can always be found in constant time. Put()'s and pop()'s
- * require log(size) time.
+/**
+ * A PriorityQueue maintains a partial ordering of its elements such that the
+ * least element can always be found in constant time. Put()'s and pop()'s
+ * require log(size) time but the remove() cost implemented here is linear.
*
- * <p><b>NOTE</b>: This class will pre-allocate a full array of
- * length <code>maxSize+1</code> if instantiated via the
- * {@link #PriorityQueue(int,boolean)} constructor with
- * <code>prepopulate</code> set to <code>true</code>.
+ * <p>
+ * <b>NOTE</b>: This class will pre-allocate a full array of length
+ * <code>maxSize+1</code> if instantiated via the
+ * {@link #PriorityQueue(int,boolean)} constructor with <code>prepopulate</code>
+ * set to <code>true</code>.
*
* @lucene.internal
-*/
+ */
public abstract class PriorityQueue<T> {
private int size = 0;
private final int maxSize;
@@ -130,7 +132,7 @@ public abstract class PriorityQueue<T> {
public final T add(T element) {
size++;
heap[size] = element;
- upHeap();
+ upHeap(size);
return heap[1];
}
@@ -174,7 +176,7 @@ public abstract class PriorityQueue<T> {
heap[1] = heap[size]; // move last to first
heap[size] = null; // permit GC of objects
size--;
- downHeap(); // adjust heap
+ downHeap(1); // adjust heap
return result;
} else {
return null;
@@ -201,7 +203,7 @@ public abstract class PriorityQueue<T> {
* @return the new 'top' element.
*/
public final T updateTop() {
- downHeap();
+ downHeap(1);
return heap[1];
}
@@ -226,8 +228,31 @@ public abstract class PriorityQueue<T> {
size = 0;
}
- private final void upHeap() {
- int i = size;
+ /**
+ * Removes an existing element currently stored in the PriorityQueue. Cost is
+ * linear with the size of the queue. (A specialization of PriorityQueue which
+ * tracks element positions would provide a constant remove time but the
+ * trade-off would be extra cost to all additions/insertions)
+ */
+ public final boolean remove(T element) {
+ for (int i = 1; i <= size; i++) {
+ if (heap[i] == element) {
+ heap[i] = heap[size];
+ heap[size] = null; // permit GC of objects
+ size--;
+ if (i <= size) {
+ if (!upHeap(i)) {
+ downHeap(i);
+ }
+ }
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private final boolean upHeap(int origPos) {
+ int i = origPos;
T node = heap[i]; // save bottom node
int j = i >>> 1;
while (j > 0 && lessThan(node, heap[j])) {
@@ -236,10 +261,10 @@ public abstract class PriorityQueue<T> {
j = j >>> 1;
}
heap[i] = node; // install saved node
+ return i != origPos;
}
-
- private final void downHeap() {
- int i = 1;
+
+ private final void downHeap(int i) {
T node = heap[i]; // save top node
int j = i << 1; // find smaller child
int k = j + 1;
@@ -257,7 +282,7 @@ public abstract class PriorityQueue<T> {
}
heap[i] = node; // install saved node
}
-
+
/** This method returns the internal heap array as Object[].
* @lucene.internal
*/
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java?rev=1659195&r1=1659194&r2=1659195&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestPriorityQueue.java Thu Feb 12 10:10:16 2015
@@ -17,21 +17,41 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+import java.util.ArrayList;
import java.util.Random;
public class TestPriorityQueue extends LuceneTestCase {
- private static class IntegerQueue extends PriorityQueue<Integer> {
- public IntegerQueue(int count) {
- super(count);
- }
-
- @Override
- protected boolean lessThan(Integer a, Integer b) {
- return (a < b);
+ private static class IntegerQueue extends PriorityQueue<Integer> {
+ public IntegerQueue(int count) {
+ super(count);
+ }
+
+ @Override
+ protected boolean lessThan(Integer a, Integer b) {
+ if (a.equals(b)) {
+ assert (a != b);
+ int hashA = System.identityHashCode(a);
+ int hashB = System.identityHashCode(b);
+ assert (hashA != hashB);
+ return hashA < hashB;
+ }
+ return (a < b);
+ }
+
+ protected final void checkValidity() {
+ Object[] heapArray = getHeapArray();
+ for (int i = 1; i <= size(); i++) {
+ int parent = i >>> 1;
+ if (parent > 1) {
+ assertTrue(lessThan((Integer) heapArray[parent],
+ (Integer) heapArray[i]));
}
+ }
}
+ }
+
public void testPQ() throws Exception {
testPQ(atLeast(10000), random());
}
@@ -111,5 +131,61 @@ public class TestPriorityQueue extends L
assertEquals(size, pq.size());
assertEquals((Integer) 2, pq.top());
}
-
+
+ public void testRemovalsAndInsertions() {
+ Random random = random();
+ int numDocsInPQ = TestUtil.nextInt(random, 1, 100);
+ IntegerQueue pq = new IntegerQueue(numDocsInPQ);
+ Integer lastLeast = null;
+
+ // Basic insertion of new content
+ ArrayList<Integer> sds = new ArrayList<Integer>(numDocsInPQ);
+ for (int i = 0; i < numDocsInPQ * 10; i++) {
+ Integer newEntry = new Integer(Math.abs(random.nextInt()));
+ sds.add(newEntry);
+ Integer evicted = pq.insertWithOverflow(newEntry);
+ pq.checkValidity();
+ if (evicted != null) {
+ assertTrue(sds.remove(evicted));
+ if (evicted != newEntry) {
+ assertTrue(evicted == lastLeast);
+ }
+ }
+ Integer newLeast = pq.top();
+ if ((lastLeast != null) && (newLeast != newEntry)
+ && (newLeast != lastLeast)) {
+ // If there has been a change of least entry and it wasn't our new
+ // addition we expect the scores to increase
+ assertTrue(newLeast <= newEntry);
+ assertTrue(newLeast >= lastLeast);
+ }
+ lastLeast = newLeast;
+
+ }
+
+ // Try many random additions to existing entries - we should always see
+ // increasing scores in the lowest entry in the PQ
+ for (int p = 0; p < 500000; p++) {
+ int element = (int) (random.nextFloat() * (sds.size() - 1));
+ Integer objectToRemove = sds.get(element);
+ assertTrue(sds.remove(element) == objectToRemove);
+ assertTrue(pq.remove(objectToRemove));
+ pq.checkValidity();
+ Integer newEntry = new Integer(Math.abs(random.nextInt()));
+ sds.add(newEntry);
+ assertNull(pq.insertWithOverflow(newEntry));
+ pq.checkValidity();
+ Integer newLeast = pq.top();
+ if ((objectToRemove != lastLeast) && (lastLeast != null)
+ && (newLeast != newEntry)) {
+ // If there has been a change of least entry and it wasn't our new
+ // addition or the loss of our randomly removed entry we expect the
+ // scores to increase
+ assertTrue(newLeast <= newEntry);
+ assertTrue(newLeast >= lastLeast);
+ }
+ lastLeast = newLeast;
+ }
+ }
+
}
Added: lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java?rev=1659195&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java (added)
+++ lucene/dev/trunk/lucene/misc/src/java/org/apache/lucene/search/DiversifiedTopDocsCollector.java Thu Feb 12 10:10:16 2015
@@ -0,0 +1,251 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.search.DiversifiedTopDocsCollector.ScoreDocKey;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * A {@link TopDocsCollector} that controls diversity in results by ensuring no
+ * more than maxHitsPerKey results from a common source are collected in the
+ * final results.
+ *
+ * An example application might be a product search in a marketplace where no
+ * more than 3 results per retailer are permitted in search results.
+ *
+ * <p>
+ * To compare behaviour with other forms of collector, a useful analogy might be
+ * the problem of making a compilation album of 1967's top hit records:
+ * <ol>
+ * <li>A vanilla query's results might look like a "Best of the Beatles" album -
+ * high quality but not much diversity</li>
+ * <li>A GroupingSearch would produce the equivalent of "The 10 top-selling
+ * artists of 1967 - some killer and quite a lot of filler"</li>
+ * <li>A "diversified" query would be the top 20 hit records of that year - with
+ * a max of 3 Beatles hits in order to maintain diversity</li>
+ * </ol>
+ * This collector improves on the "GroupingSearch" type queries by
+ * <ul>
+ * <li>Working in one pass over the data</li>
+ * <li>Not requiring the client to guess how many groups are required</li>
+ * <li>Removing low-scoring "filler" which sits at the end of each group's hits</li>
+ * </ul>
+ *
+ * This is an abstract class and subclasses have to provide a source of keys for
+ * documents which is then used to help identify duplicate sources.
+ *
+ * @lucene.experimental
+ *
+ */
+public abstract class DiversifiedTopDocsCollector extends
+ TopDocsCollector<ScoreDocKey> {
+ ScoreDocKey spare;
+ private ScoreDocKeyQueue globalQueue;
+ private int numHits;
+ private Map<Long, ScoreDocKeyQueue> perKeyQueues;
+ protected int maxNumPerKey;
+ private Stack<ScoreDocKeyQueue> sparePerKeyQueues = new Stack<>();
+
+ public DiversifiedTopDocsCollector(int numHits, int maxHitsPerKey) {
+ super(new ScoreDocKeyQueue(numHits));
+ // Need to access pq.lessThan() which is protected so have to cast here...
+ this.globalQueue = (ScoreDocKeyQueue) pq;
+ perKeyQueues = new HashMap<Long, ScoreDocKeyQueue>();
+ this.numHits = numHits;
+ this.maxNumPerKey = maxHitsPerKey;
+ }
+
+ /**
+ * Get a source of values used for grouping keys
+ */
+ protected abstract NumericDocValues getKeys(LeafReaderContext context);
+
+ @Override
+ public boolean needsScores() {
+ return true;
+ }
+
+ @Override
+ protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
+ if (results == null) {
+ return EMPTY_TOPDOCS;
+ }
+
+ // We need to compute maxScore in order to set it in TopDocs. If start == 0,
+ // it means the largest element is already in results, use its score as
+ // maxScore. Otherwise pop everything else, until the largest element is
+ // extracted and use its score as maxScore.
+ float maxScore = Float.NaN;
+ if (start == 0) {
+ maxScore = results[0].score;
+ } else {
+ for (int i = globalQueue.size(); i > 1; i--) {
+ globalQueue.pop();
+ }
+ maxScore = globalQueue.pop().score;
+ }
+
+ return new TopDocs(totalHits, results, maxScore);
+ }
+
+ protected ScoreDocKey insert(ScoreDocKey addition, int docBase,
+ NumericDocValues keys) {
+ if ((globalQueue.size() >= numHits)
+ && (globalQueue.lessThan(addition, globalQueue.top()))) {
+ // Queue is full and proposed addition is not a globally
+ // competitive score
+ return addition;
+ }
+ // The addition stands a chance of being entered - check the
+ // key-specific restrictions.
+ // We delay fetching the key until we are certain the score is globally
+ // competitive. We need to adjust the ScoreDoc's global doc value to be
+ // a leaf reader value when looking up keys
+ addition.key = keys.get(addition.doc - docBase);
+
+ // For this to work the choice of key class needs to implement
+ // hashcode and equals.
+ ScoreDocKeyQueue thisKeyQ = perKeyQueues.get(addition.key);
+
+ if (thisKeyQ == null) {
+ if (sparePerKeyQueues.size() == 0) {
+ thisKeyQ = new ScoreDocKeyQueue(maxNumPerKey);
+ } else {
+ thisKeyQ = sparePerKeyQueues.pop();
+ }
+ perKeyQueues.put(addition.key, thisKeyQ);
+ }
+ ScoreDocKey perKeyOverflow = thisKeyQ.insertWithOverflow(addition);
+ if (perKeyOverflow == addition) {
+ // This key group has reached capacity and our proposed addition
+ // was not competitive in the group - do not insert into the
+ // main PQ or the key will be overly-populated in final results.
+ return addition;
+ }
+ if (perKeyOverflow == null) {
+ // This proposed addition is also locally competitive within the
+ // key group - make a global entry and return
+ ScoreDocKey globalOverflow = globalQueue.insertWithOverflow(addition);
+ perKeyGroupRemove(globalOverflow);
+ return globalOverflow;
+ }
+ // For the given key, we have reached max capacity but the new addition
+ // is better than a prior entry that still exists in the global results
+ // - request the weaker-scoring entry to be removed from the global
+ // queue.
+ globalQueue.remove(perKeyOverflow);
+ // Add the locally-competitive addition into the globally queue
+ globalQueue.add(addition);
+ return perKeyOverflow;
+ }
+
+ private void perKeyGroupRemove(ScoreDocKey globalOverflow) {
+ if (globalOverflow == null) {
+ return;
+ }
+ ScoreDocKeyQueue q = perKeyQueues.get(globalOverflow.key);
+ ScoreDocKey perKeyLowest = q.pop();
+ // The least globally-competitive item should also always be the least
+ // key-local item
+ assert (globalOverflow == perKeyLowest);
+ if (q.size() == 0) {
+ perKeyQueues.remove(globalOverflow.key);
+ sparePerKeyQueues.push(q);
+ }
+ }
+
+ @Override
+ public LeafCollector getLeafCollector(LeafReaderContext context)
+ throws IOException {
+ final int base = context.docBase;
+ final NumericDocValues keySource = getKeys(context);
+
+ return new LeafCollector() {
+ Scorer scorer;
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ this.scorer = scorer;
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
+ float score = scorer.score();
+
+ // This collector cannot handle NaN
+ assert !Float.isNaN(score);
+
+ totalHits++;
+
+ doc += base;
+
+ if (spare == null) {
+ spare = new ScoreDocKey(doc, score);
+ } else {
+ spare.doc = doc;
+ spare.score = score;
+ }
+ spare = insert(spare, base, keySource);
+ }
+ };
+ }
+
+ static class ScoreDocKeyQueue extends PriorityQueue<ScoreDocKey> {
+
+ ScoreDocKeyQueue(int size) {
+ super(size);
+ }
+
+ @Override
+ protected final boolean lessThan(ScoreDocKey hitA, ScoreDocKey hitB) {
+ if (hitA.score == hitB.score)
+ return hitA.doc > hitB.doc;
+ else
+ return hitA.score < hitB.score;
+ }
+ }
+
+ //
+ /**
+ * An extension to ScoreDoc that includes a key used for grouping purposes
+ */
+ static public class ScoreDocKey extends ScoreDoc {
+ Long key;
+
+ protected ScoreDocKey(int doc, float score) {
+ super(doc, score);
+ }
+
+ public Long getKey() {
+ return key;
+ }
+
+ @Override
+ public String toString() {
+ return "key:" + key + " doc=" + doc + " s=" + score;
+ }
+
+ }
+
+}
Added: lucene/dev/trunk/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java?rev=1659195&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java (added)
+++ lucene/dev/trunk/lucene/misc/src/test/org/apache/lucene/search/TestDiversifiedTopDocsCollector.java Thu Feb 12 10:10:16 2015
@@ -0,0 +1,464 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.FloatDocValuesField;
+import org.apache.lucene.document.FloatField;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Demonstrates an application of the {@link DiversifiedTopDocsCollector} in
+ * assembling a collection of top results but without over-representation of any
+ * one source (in this case top-selling singles from the 60s without having them
+ * all be Beatles records...). Results are ranked by the number of weeks a
+ * single is top of the charts and de-duped by the artist name.
+ *
+ */
+public class TestDiversifiedTopDocsCollector extends LuceneTestCase {
+
+ public void testNonDiversifiedResults() throws Exception {
+ int numberOfTracksOnCompilation = 10;
+ int expectedMinNumOfBeatlesHits = 5;
+ TopDocs res = searcher.search(getTestQuery(), numberOfTracksOnCompilation);
+ assertEquals(numberOfTracksOnCompilation, res.scoreDocs.length);
+ // due to randomization of segment merging in tests the exact number of Beatles hits
+ // selected varies between 5 and 6 but we prove the point they are over-represented
+ // in our result set using a standard search.
+ assertTrue(getMaxNumRecordsPerArtist(res.scoreDocs) >= expectedMinNumOfBeatlesHits);
+ }
+
+ public void testFirstPageDiversifiedResults() throws Exception {
+ // Using a diversified collector we can limit the results from
+ // any one artist.
+ int requiredMaxHitsPerArtist = 2;
+ int numberOfTracksOnCompilation = 10;
+ DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
+ numberOfTracksOnCompilation, requiredMaxHitsPerArtist);
+ ScoreDoc[] sd = tdc.topDocs(0).scoreDocs;
+ assertEquals(numberOfTracksOnCompilation, sd.length);
+ assertTrue(getMaxNumRecordsPerArtist(sd) <= requiredMaxHitsPerArtist);
+ }
+
+ public void testSecondPageResults() throws Exception {
+ int numberOfTracksPerCompilation = 10;
+ int numberOfCompilations = 2;
+ int requiredMaxHitsPerArtist = 1;
+
+ // Volume 2 of our hits compilation - start at position 10
+ DiversifiedTopDocsCollector tdc = doDiversifiedSearch(
+ numberOfTracksPerCompilation * numberOfCompilations,
+ requiredMaxHitsPerArtist);
+ ScoreDoc[] volume2 = tdc.topDocs(numberOfTracksPerCompilation,
+ numberOfTracksPerCompilation).scoreDocs;
+ assertEquals(numberOfTracksPerCompilation, volume2.length);
+ assertTrue(getMaxNumRecordsPerArtist(volume2) <= requiredMaxHitsPerArtist);
+
+ }
+
+ public void testInvalidArguments() throws Exception {
+ int numResults = 5;
+ DiversifiedTopDocsCollector tdc = doDiversifiedSearch(numResults, 15);
+
+ // start < 0
+ assertEquals(0, tdc.topDocs(-1).scoreDocs.length);
+
+ // start > pq.size()
+ assertEquals(0, tdc.topDocs(numResults + 1).scoreDocs.length);
+
+ // start == pq.size()
+ assertEquals(0, tdc.topDocs(numResults).scoreDocs.length);
+
+ // howMany < 0
+ assertEquals(0, tdc.topDocs(0, -1).scoreDocs.length);
+
+ // howMany == 0
+ assertEquals(0, tdc.topDocs(0, 0).scoreDocs.length);
+
+ }
+
+ // Diversifying collector that looks up de-dup keys using SortedDocValues
+ // from a top-level Reader
+ private static final class DocValuesDiversifiedCollector extends
+ DiversifiedTopDocsCollector {
+ private final SortedDocValues sdv;
+
+ public DocValuesDiversifiedCollector(int size, int maxHitsPerKey,
+ SortedDocValues sdv) {
+ super(size, maxHitsPerKey);
+ this.sdv = sdv;
+ }
+
+ @Override
+ protected NumericDocValues getKeys(final LeafReaderContext context) {
+
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ // Keys are always expressed as a long so we obtain the
+ // ordinal for our String-based artist name here
+ return sdv.getOrd(context.docBase + docID);
+ }
+ };
+ }
+ }
+
+ // Alternative, faster implementation for converting String keys to longs
+ // but with the potential for hash collisions
+ private static final class HashedDocValuesDiversifiedCollector extends
+ DiversifiedTopDocsCollector {
+
+ private final String field;
+ private BinaryDocValues vals;
+
+ public HashedDocValuesDiversifiedCollector(int size, int maxHitsPerKey,
+ String field) {
+ super(size, maxHitsPerKey);
+ this.field = field;
+ }
+
+ @Override
+ protected NumericDocValues getKeys(LeafReaderContext context) {
+ return new NumericDocValues() {
+ @Override
+ public long get(int docID) {
+ return vals == null ? -1 : vals.get(docID).hashCode();
+ }
+ };
+ }
+
+ @Override
+ public LeafCollector getLeafCollector(LeafReaderContext context)
+ throws IOException {
+ this.vals = DocValues.getBinary(context.reader(), field);
+ return super.getLeafCollector(context);
+ }
+ }
+
+ // Test data - format is artist, song, weeks at top of charts
+ private static String[] hitsOfThe60s = {
+ "1966\tSPENCER DAVIS GROUP\tKEEP ON RUNNING\t1",
+ "1966\tOVERLANDERS\tMICHELLE\t3",
+ "1966\tNANCY SINATRA\tTHESE BOOTS ARE MADE FOR WALKIN'\t4",
+ "1966\tWALKER BROTHERS\tTHE SUN AIN'T GONNA SHINE ANYMORE\t4",
+ "1966\tSPENCER DAVIS GROUP\tSOMEBODY HELP ME\t2",
+ "1966\tDUSTY SPRINGFIELD\tYOU DON'T HAVE TO SAY YOU LOVE ME\t1",
+ "1966\tMANFRED MANN\tPRETTY FLAMINGO\t3",
+ "1966\tROLLING STONES\tPAINT IT, BLACK\t1",
+ "1966\tFRANK SINATRA\tSTRANGERS IN THE NIGHT\t3",
+ "1966\tBEATLES\tPAPERBACK WRITER\t5",
+ "1966\tKINKS\tSUNNY AFTERNOON\t2",
+ "1966\tGEORGIE FAME AND THE BLUE FLAMES\tGETAWAY\t1",
+ "1966\tCHRIS FARLOWE\tOUT OF TIME\t1",
+ "1966\tTROGGS\tWITH A GIRL LIKE YOU\t2",
+ "1966\tBEATLES\tYELLOW SUBMARINE/ELEANOR RIGBY\t4",
+ "1966\tSMALL FACES\tALL OR NOTHING\t1",
+ "1966\tJIM REEVES\tDISTANT DRUMS\t5",
+ "1966\tFOUR TOPS\tREACH OUT I'LL BE THERE\t3",
+ "1966\tBEACH BOYS\tGOOD VIBRATIONS\t2",
+ "1966\tTOM JONES\tGREEN GREEN GRASS OF HOME\t4",
+ "1967\tMONKEES\tI'M A BELIEVER\t4",
+ "1967\tPETULA CLARK\tTHIS IS MY SONG\t2",
+ "1967\tENGELBERT HUMPERDINCK\tRELEASE ME\t4",
+ "1967\tNANCY SINATRA AND FRANK SINATRA\tSOMETHIN' STUPID\t2",
+ "1967\tSANDIE SHAW\tPUPPET ON A STRING\t3",
+ "1967\tTREMELOES\tSILENCE IS GOLDEN\t3",
+ "1967\tPROCOL HARUM\tA WHITER SHADE OF PALE\t4",
+ "1967\tBEATLES\tALL YOU NEED IS LOVE\t7",
+ "1967\tSCOTT MCKENZIE\tSAN FRANCISCO (BE SURE TO WEAR SOME FLOWERS INYOUR HAIR)\t4",
+ "1967\tENGELBERT HUMPERDINCK\tTHE LAST WALTZ\t5",
+ "1967\tBEE GEES\tMASSACHUSETTS (THE LIGHTS WENT OUT IN)\t4",
+ "1967\tFOUNDATIONS\tBABY NOW THAT I'VE FOUND YOU\t2",
+ "1967\tLONG JOHN BALDRY\tLET THE HEARTACHES BEGIN\t2",
+ "1967\tBEATLES\tHELLO GOODBYE\t5",
+ "1968\tGEORGIE FAME\tTHE BALLAD OF BONNIE AND CLYDE\t1",
+ "1968\tLOVE AFFAIR\tEVERLASTING LOVE\t2",
+ "1968\tMANFRED MANN\tMIGHTY QUINN\t2",
+ "1968\tESTHER AND ABI OFARIM\tCINDERELLA ROCKEFELLA\t3",
+ "1968\tDAVE DEE, DOZY, BEAKY, MICK AND TICH\tTHE LEGEND OF XANADU\t1",
+ "1968\tBEATLES\tLADY MADONNA\t2",
+ "1968\tCLIFF RICHARD\tCONGRATULATIONS\t2",
+ "1968\tLOUIS ARMSTRONG\tWHAT A WONDERFUL WORLD/CABARET\t4",
+ "1968\tGARRY PUCKETT AND THE UNION GAP\tYOUNG GIRL\t4",
+ "1968\tROLLING STONES\tJUMPING JACK FLASH\t2",
+ "1968\tEQUALS\tBABY COME BACK\t3", "1968\tDES O'CONNOR\tI PRETEND\t1",
+ "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t2",
+ "1968\tCRAZY WORLD OF ARTHUR BROWN\tFIRE!\t1",
+ "1968\tTOMMY JAMES AND THE SHONDELLS\tMONY MONY\t1",
+ "1968\tBEACH BOYS\tDO IT AGAIN\t1",
+ "1968\tBEE GEES\tI'VE GOTTA GET A MESSAGE TO YOU\t1",
+ "1968\tBEATLES\tHEY JUDE\t8",
+ "1968\tMARY HOPKIN\tTHOSE WERE THE DAYS\t6",
+ "1968\tJOE COCKER\tWITH A LITTLE HELP FROM MY FRIENDS\t1",
+ "1968\tHUGO MONTENEGRO\tTHE GOOD THE BAD AND THE UGLY\t4",
+ "1968\tSCAFFOLD\tLILY THE PINK\t3",
+ "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t1",
+ "1969\tSCAFFOLD\tLILY THE PINK\t1",
+ "1969\tMARMALADE\tOB-LA-DI, OB-LA-DA\t2",
+ "1969\tFLEETWOOD MAC\tALBATROSS\t1", "1969\tMOVE\tBLACKBERRY WAY\t1",
+ "1969\tAMEN CORNER\t(IF PARADISE IS) HALF AS NICE\t2",
+ "1969\tPETER SARSTEDT\tWHERE DO YOU GO TO (MY LOVELY)\t4",
+ "1969\tMARVIN GAYE\tI HEARD IT THROUGH THE GRAPEVINE\t3",
+ "1969\tDESMOND DEKKER AND THE ACES\tTHE ISRAELITES\t1",
+ "1969\tBEATLES\tGET BACK\t6", "1969\tTOMMY ROE\tDIZZY\t1",
+ "1969\tBEATLES\tTHE BALLAD OF JOHN AND YOKO\t3",
+ "1969\tTHUNDERCLAP NEWMAN\tSOMETHING IN THE AIR\t3",
+ "1969\tROLLING STONES\tHONKY TONK WOMEN\t5",
+ "1969\tZAGER AND EVANS\tIN THE YEAR 2525 (EXORDIUM AND TERMINUS)\t3",
+ "1969\tCREEDENCE CLEARWATER REVIVAL\tBAD MOON RISING\t3",
+ "1969\tJANE BIRKIN AND SERGE GAINSBOURG\tJE T'AIME... MOI NON PLUS\t1",
+ "1969\tBOBBIE GENTRY\tI'LL NEVER FALL IN LOVE AGAIN\t1",
+ "1969\tARCHIES\tSUGAR, SUGAR\t4" };
+
+ private static final Map<String, Record> parsedRecords = new HashMap<String, Record>();
+ private Directory dir;
+ private IndexReader reader;
+ private IndexSearcher searcher;
+ private SortedDocValues artistDocValues;
+
+ static class Record {
+ String year;
+ String artist;
+ String song;
+ float weeks;
+ String id;
+
+ public Record(String id, String year, String artist, String song,
+ float weeks) {
+ super();
+ this.id = id;
+ this.year = year;
+ this.artist = artist;
+ this.song = song;
+ this.weeks = weeks;
+ }
+
+ @Override
+ public String toString() {
+ return "Record [id=" + id + ", artist=" + artist + ", weeks=" + weeks
+ + ", year=" + year + ", song=" + song + "]";
+ }
+
+ }
+
+ private DiversifiedTopDocsCollector doDiversifiedSearch(int numResults,
+ int maxResultsPerArtist) throws IOException {
+ // Alternate between implementations used for key lookups
+ if (random().nextBoolean()) {
+ // Faster key lookup but with potential for collisions on larger datasets
+ return doFuzzyDiversifiedSearch(numResults, maxResultsPerArtist);
+ } else {
+ // Slower key lookup but 100% accurate
+ return doAccurateDiversifiedSearch(numResults, maxResultsPerArtist);
+ }
+ }
+
+ private DiversifiedTopDocsCollector doFuzzyDiversifiedSearch(int numResults,
+ int maxResultsPerArtist) throws IOException {
+ DiversifiedTopDocsCollector tdc = new HashedDocValuesDiversifiedCollector(
+ numResults, maxResultsPerArtist, "artist");
+ searcher.search(getTestQuery(), tdc);
+ return tdc;
+ }
+
+ private DiversifiedTopDocsCollector doAccurateDiversifiedSearch(
+ int numResults, int maxResultsPerArtist) throws IOException {
+ DiversifiedTopDocsCollector tdc = new DocValuesDiversifiedCollector(
+ numResults, maxResultsPerArtist, artistDocValues);
+ searcher.search(getTestQuery(), tdc);
+ return tdc;
+ }
+
+ private Query getTestQuery() {
+ BooleanQuery testQuery = new BooleanQuery();
+ testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1966")),
+ Occur.SHOULD));
+ testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1967")),
+ Occur.SHOULD));
+ testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1968")),
+ Occur.SHOULD));
+ testQuery.add(new BooleanClause(new TermQuery(new Term("year", "1969")),
+ Occur.SHOULD));
+ return testQuery;
+ }
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+
+ // populate an index with documents - artist, song and weeksAtNumberOne
+ dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+ Document doc = new Document();
+
+ Field yearField = newTextField("year", "", Field.Store.NO);
+ SortedDocValuesField artistField = new SortedDocValuesField("artist",
+ new BytesRef(""));
+ Field weeksAtNumberOneField = new FloatDocValuesField("weeksAtNumberOne",
+ 0.0F);
+ Field weeksStoredField = new FloatField("weeks", 0.0F, Store.YES);
+ Field idField = newStringField("id", "", Field.Store.YES);
+ Field songField = newTextField("song", "", Field.Store.NO);
+ Field storedArtistField = newTextField("artistName", "", Field.Store.NO);
+
+ doc.add(idField);
+ doc.add(weeksAtNumberOneField);
+ doc.add(storedArtistField);
+ doc.add(songField);
+ doc.add(weeksStoredField);
+ doc.add(yearField);
+ doc.add(artistField);
+
+ parsedRecords.clear();
+ for (int i = 0; i < hitsOfThe60s.length; i++) {
+ String cols[] = hitsOfThe60s[i].split("\t");
+ Record record = new Record(String.valueOf(i), cols[0], cols[1], cols[2],
+ Float.valueOf(cols[3]));
+ parsedRecords.put(record.id, record);
+ idField.setStringValue(record.id);
+ yearField.setStringValue(record.year);
+ storedArtistField.setStringValue(record.artist);
+ artistField.setBytesValue(new BytesRef(record.artist));
+ songField.setStringValue(record.song);
+ weeksStoredField.setFloatValue(record.weeks);
+ weeksAtNumberOneField.setFloatValue(record.weeks);
+ writer.addDocument(doc);
+ if (i % 10 == 0) {
+ // Causes the creation of multiple segments for our test
+ writer.commit();
+ }
+ }
+ reader = writer.getReader();
+ writer.close();
+ searcher = newSearcher(reader);
+ LeafReader ar = SlowCompositeReaderWrapper.wrap(reader);
+ artistDocValues = ar.getSortedDocValues("artist");
+
+ // All searches sort by song popularity
+ final Similarity base = searcher.getSimilarity();
+ searcher.setSimilarity(new DocValueSimilarity(base, "weeksAtNumberOne"));
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ reader.close();
+ dir.close();
+ dir = null;
+ super.tearDown();
+ }
+
+ private int getMaxNumRecordsPerArtist(ScoreDoc[] sd) throws IOException {
+ int result = 0;
+ HashMap<String, Integer> artistCounts = new HashMap<String, Integer>();
+ for (int i = 0; i < sd.length; i++) {
+ StoredDocument doc = reader.document(sd[i].doc);
+ Record record = parsedRecords.get(doc.get("id"));
+ Integer count = artistCounts.get(record.artist);
+ int newCount = 1;
+ if (count != null) {
+ newCount = count.intValue() + 1;
+ }
+ result = Math.max(result, newCount);
+ artistCounts.put(record.artist, newCount);
+ }
+ return result;
+ }
+
+ /**
+ * Similarity that wraps another similarity and replaces the final score
+ * according to whats in a docvalues field.
+ *
+ * @lucene.experimental
+ */
+ static class DocValueSimilarity extends Similarity {
+ private final Similarity sim;
+ private final String scoreValueField;
+
+ public DocValueSimilarity(Similarity sim, String scoreValueField) {
+ this.sim = sim;
+ this.scoreValueField = scoreValueField;
+ }
+
+ @Override
+ public long computeNorm(FieldInvertState state) {
+ return sim.computeNorm(state);
+ }
+
+ @Override
+ public SimWeight computeWeight(float queryBoost,
+ CollectionStatistics collectionStats, TermStatistics... termStats) {
+ return sim.computeWeight(queryBoost, collectionStats, termStats);
+ }
+
+ @Override
+ public SimScorer simScorer(SimWeight stats, LeafReaderContext context)
+ throws IOException {
+ final SimScorer sub = sim.simScorer(stats, context);
+ final NumericDocValues values = DocValues.getNumeric(context.reader(),
+ scoreValueField);
+
+ return new SimScorer() {
+ @Override
+ public float score(int doc, float freq) {
+ return Float.intBitsToFloat((int) values.get(doc));
+ }
+
+ @Override
+ public float computeSlopFactor(int distance) {
+ return sub.computeSlopFactor(distance);
+ }
+
+ @Override
+ public float computePayloadFactor(int doc, int start, int end,
+ BytesRef payload) {
+ return sub.computePayloadFactor(doc, start, end, payload);
+ }
+
+ @Override
+ public Explanation explain(int doc, Explanation freq) {
+ return new Explanation(Float.intBitsToFloat((int) values.get(doc)),
+ "indexDocValue(" + scoreValueField + ")");
+ }
+ };
+ }
+ }
+
+}