You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/17 19:20:54 UTC
svn commit: r1104421 - in /lucene/dev/trunk/modules/grouping/src:
java/org/apache/lucene/search/grouping/
test/org/apache/lucene/search/grouping/
Author: mikemccand
Date: Tue May 17 17:20:54 2011
New Revision: 1104421
URL: http://svn.apache.org/viewvc?rev=1104421&view=rev
Log:
LUCENE-3098: add AllGroupsCollector
Added:
lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java (with props)
lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java (with props)
Modified:
lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
Added: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java?rev=1104421&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java (added)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java Tue May 17 17:20:54 2011
@@ -0,0 +1,131 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * A collector that collects all groups that match the
+ * query. Only the group value is collected, and the order
+ * is undefined. This collector does not determine
+ * the most relevant document of a group.
+ *
+ * <p/>
+ * Internally, {@link SentinelIntSet} is used to detect
+ * if a group has already been added to the total count. For
+ * each segment the {@link SentinelIntSet} is cleared and filled
+ * with previously counted groups that occur in the new
+ * segment.
+ *
+ * @lucene.experimental
+ */
+public class AllGroupsCollector extends Collector {
+
+ private static final int DEFAULT_INITIAL_SIZE = 128;
+
+ private final String groupField;
+ private final SentinelIntSet ordSet;
+ private final List<BytesRef> groups;
+ private final BytesRef spareBytesRef = new BytesRef();
+
+ private FieldCache.DocTermsIndex index;
+
+ /**
+ * Expert: Constructs an {@link AllGroupsCollector}
+ *
+ * @param groupField The field to group by
+ * @param initialSize The initial size of the {@link SentinelIntSet} and groups list. The initial size should
+ * roughly match the total number of expected unique groups. Be aware that the heap usage
+ * is 4 bytes * initialSize.
+ */
+ public AllGroupsCollector(String groupField, int initialSize) {
+ this.groupField = groupField;
+ ordSet = new SentinelIntSet(initialSize, -1);
+ groups = new ArrayList<BytesRef>(initialSize);
+ }
+
+ /**
+ * Constructs an {@link AllGroupsCollector}. This delegates to the {@link #AllGroupsCollector(String, int)} constructor
+ * with an initialSize of 128 for the {@link SentinelIntSet} and group list.
+ *
+ * @param groupField The field to group by
+ */
+ public AllGroupsCollector(String groupField) {
+ this(groupField, DEFAULT_INITIAL_SIZE);
+ }
+
+ public void setScorer(Scorer scorer) throws IOException {
+ }
+
+ public void collect(int doc) throws IOException {
+ int key = index.getOrd(doc);
+ if (!ordSet.exists(key)) {
+ ordSet.put(key);
+ BytesRef term = key == 0 ? null : index.getTerm(doc, new BytesRef());
+ groups.add(term);
+ }
+ }
+
+ /**
+ * Returns the total number of groups for the executed search.
+ * This is a convenience method. The following code snippet has the same effect: <code>getGroups().size()</code>
+ *
+ * @return The total number of groups for the executed search
+ */
+ public int getGroupCount() {
+ return groups.size();
+ }
+
+ /**
+ * Returns the group values
+ * <p/>
+ * This is an unordered collection of group values. For each group that matched the query there is a {@link BytesRef}
+ * representing a group value (<code>null</code> for documents that have no value for the group field).
+ *
+ * @return the group values
+ */
+ public Collection<BytesRef> getGroups() {
+ return groups;
+ }
+
+ public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+ index = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
+
+ // Clear ordSet and fill it with previously encountered groups that can occur in the current segment.
+ ordSet.clear();
+ for (BytesRef countedGroup : groups) {
+ int ord = index.binarySearchLookup(countedGroup, spareBytesRef);
+ if (ord >= 0) {
+ ordSet.put(ord);
+ }
+ }
+ }
+
+ public boolean acceptsDocsOutOfOrder() {
+ return true;
+ }
+}
\ No newline at end of file
Modified: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (original)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java Tue May 17 17:20:54 2011
@@ -21,9 +21,6 @@ import org.apache.lucene.search.SortFiel
/** Represents result returned by a grouping search.
*
- * Note that we do not return the total number of unique
- * groups; doing so would be costly.
- *
* @lucene.experimental */
public class TopGroups {
/** Number of documents matching the search */
@@ -32,6 +29,9 @@ public class TopGroups {
/** Number of documents grouped into the topN groups */
public final int totalGroupedHitCount;
+ /** The total number of unique groups. If <code>null</code> this value is not computed. */
+ public final Integer totalGroupCount;
+
/** Group results in groupSort order */
public final GroupDocs[] groups;
@@ -47,5 +47,15 @@ public class TopGroups {
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
+ this.totalGroupCount = null;
+ }
+
+ public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
+ this.groupSort = oldTopGroups.groupSort;
+ this.withinGroupSort = oldTopGroups.withinGroupSort;
+ this.totalHitCount = oldTopGroups.totalHitCount;
+ this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
+ this.groups = oldTopGroups.groups;
+ this.totalGroupCount = totalGroupCount;
}
}
Modified: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html (original)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html Tue May 17 17:20:54 2011
@@ -88,6 +88,13 @@ field fall into a single group.</p>
boolean fillFields = true;
SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
+ //Optionally compute total group count
+ AllGroupsCollector allGroupsCollector = null;
+ if (requiredTotalGroupCount) {
+ allGroupsCollector = new AllGroupsCollector("author");
+ c2 = MultiCollector.wrap(c2, allGroupsCollector);
+ }
+
if (cachedCollector.isCached()) {
// Cache fit within maxCacheRAMMB, so we can replay it:
cachedCollector.replay(c2);
@@ -97,6 +104,9 @@ field fall into a single group.</p>
}
TopGroups groupsResult = c2.getTopGroups(docOffset);
+ if (requiredTotalGroupCount) {
+ groupsResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
+ }
// Render groupsResult...
</pre>
Added: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java?rev=1104421&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java (added)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java Tue May 17 17:20:54 2011
@@ -0,0 +1,109 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class AllGroupsCollectorTest extends LuceneTestCase {
+
+ public void testTotalGroupCount() throws Exception {
+
+ final String groupField = "author";
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(
+ random,
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+ // 0
+ Document doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 1
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 2
+ doc = new Document();
+ doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+ w.commit(); // To ensure a second segment
+
+ // 3
+ doc = new Document();
+ doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 4
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 5
+ doc = new Document();
+ doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ // 6 -- no author field
+ doc = new Document();
+ doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
+ doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+ w.addDocument(doc);
+
+ IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+ w.close();
+
+ AllGroupsCollector c1 = new AllGroupsCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
+ assertEquals(4, c1.getGroupCount());
+
+ AllGroupsCollector c2 = new AllGroupsCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
+ assertEquals(3, c2.getGroupCount());
+
+ AllGroupsCollector c3 = new AllGroupsCollector(groupField);
+ indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
+ assertEquals(2, c3.getGroupCount());
+
+ indexSearcher.getIndexReader().close();
+ dir.close();
+ }
+}
Modified: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Tue May 17 17:20:54 2011
@@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -32,15 +26,7 @@ import org.apache.lucene.document.Numeri
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.CachingCollector;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -121,7 +107,7 @@ public class TestGrouping extends Lucene
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
-
+
final TopGroups groups = c2.getTopGroups(0);
assertEquals(7, groups.totalHitCount);
@@ -243,6 +229,7 @@ public class TestGrouping extends Lucene
boolean fillFields,
boolean getScores,
boolean getMaxScores,
+ boolean doAllGroups,
Sort groupSort,
Sort docSort,
int topNGroups,
@@ -258,6 +245,7 @@ public class TestGrouping extends Lucene
final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
int totalHitCount = 0;
+ Set<BytesRef> knownGroups = new HashSet<BytesRef>();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@@ -265,6 +253,13 @@ public class TestGrouping extends Lucene
continue;
}
totalHitCount++;
+
+ if (doAllGroups) {
+ if (!knownGroups.contains(d.group)) {
+ knownGroups.add(d.group);
+ }
+ }
+
List<GroupDoc> l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@@ -317,7 +312,14 @@ public class TestGrouping extends Lucene
fillFields ? sortedGroupFields.get(idx) : null);
}
- return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ if (doAllGroups) {
+ return new TopGroups(
+ new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
+ knownGroups.size()
+ );
+ } else {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ }
}
public void testRandom() throws Exception {
@@ -335,7 +337,7 @@ public class TestGrouping extends Lucene
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
-
+
final List<BytesRef> groups = new ArrayList<BytesRef>();
for(int i=0;i<numGroups;i++) {
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@@ -428,8 +430,16 @@ public class TestGrouping extends Lucene
//final int docOffset = 0;
final boolean doCache = random.nextBoolean();
+ final boolean doAllGroups = random.nextBoolean();
if (VERBOSE) {
- System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
+ System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+ }
+
+ final AllGroupsCollector allGroupsCollector;
+ if (doAllGroups) {
+ allGroupsCollector = new AllGroupsCollector("group");
+ } else {
+ allGroupsCollector = null;
}
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
@@ -440,7 +450,16 @@ public class TestGrouping extends Lucene
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
- c = cCache = new CachingCollector(c1, true, maxCacheMB);
+
+ if (doAllGroups) {
+ cCache = new CachingCollector(c1, true, maxCacheMB);
+ c = MultiCollector.wrap(cCache, allGroupsCollector);
+ } else {
+ c = cCache = new CachingCollector(c1, true, maxCacheMB);
+ }
+ } else if (doAllGroups) {
+ c = MultiCollector.wrap(c1, allGroupsCollector);
+ cCache = null;
} else {
c = c1;
cCache = null;
@@ -475,8 +494,13 @@ public class TestGrouping extends Lucene
} else {
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
-
- groupsResult = c2.getTopGroups(docOffset);
+
+ if (doAllGroups) {
+ TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+ groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
+ } else {
+ groupsResult = c2.getTopGroups(docOffset);
+ }
} else {
groupsResult = null;
if (VERBOSE) {
@@ -484,7 +508,7 @@ public class TestGrouping extends Lucene
}
}
- final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+ final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
try {
// NOTE: intentional but temporary field cache insanity!
@@ -509,7 +533,10 @@ public class TestGrouping extends Lucene
assertEquals(expected.groups.length, actual.groups.length);
assertEquals(expected.totalHitCount, actual.totalHitCount);
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
-
+ if (expected.totalGroupCount != null) {
+ assertEquals(expected.totalGroupCount, actual.totalGroupCount);
+ }
+
for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
if (VERBOSE) {
System.out.println(" check groupIDX=" + groupIDX);