You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/17 19:20:54 UTC

svn commit: r1104421 - in /lucene/dev/trunk/modules/grouping/src: java/org/apache/lucene/search/grouping/ test/org/apache/lucene/search/grouping/

Author: mikemccand
Date: Tue May 17 17:20:54 2011
New Revision: 1104421

URL: http://svn.apache.org/viewvc?rev=1104421&view=rev
Log:
LUCENE-3098: add AllGroupsCollector

Added:
    lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java   (with props)
    lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java   (with props)
Modified:
    lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
    lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
    lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java

Added: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java?rev=1104421&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java (added)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AllGroupsCollector.java Tue May 17 17:20:54 2011
@@ -0,0 +1,131 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * A collector that collects all groups that match the
+ * query. Only the group value is collected, and the order
+ * is undefined.  This collector does not determine
+ * the most relevant document of a group.
+ *
+ * <p/>
+ * Internally, {@link SentinelIntSet} is used to detect
+ * if a group is already added to the total count.  For each
+ * segment the {@link SentinelIntSet} is cleared and filled
+ * with previous counted groups that occur in the new
+ * segment.
+ *
+ * @lucene.experimental
+ */
+public class AllGroupsCollector extends Collector {
+
+  private static final int DEFAULT_INITIAL_SIZE = 128;
+
+  private final String groupField;
+  private final SentinelIntSet ordSet;
+  private final List<BytesRef> groups;
+  private final BytesRef spareBytesRef = new BytesRef();
+
+  private FieldCache.DocTermsIndex index;
+
+  /**
+   * Expert: Constructs a {@link AllGroupsCollector}
+   *
+   * @param groupField  The field to group by
+   * @param initialSize The initial size of the {@link SentinelIntSet} and groups list. The initial size should
+   *                    roughly match the total number of expected unique groups. Be aware that the heap usage
+   *                    is 4 bytes * initialSize.
+   */
+  public AllGroupsCollector(String groupField, int initialSize) {
+    this.groupField = groupField;
+    ordSet = new SentinelIntSet(initialSize, -1);
+    groups = new ArrayList<BytesRef>(initialSize);
+  }
+
+  /**
+   * Constructs a {@link AllGroupsCollector}. This sets the initialSize for the {@link SentinelIntSet} and group list
+   * to 128 in the {@link #AllGroupsCollector(String, int)} constructor.
+   *
+   * @param groupField The field to group by
+   */
+  public AllGroupsCollector(String groupField) {
+    this(groupField, DEFAULT_INITIAL_SIZE);
+  }
+
+  public void setScorer(Scorer scorer) throws IOException {
+  }
+
+  public void collect(int doc) throws IOException {
+    int key = index.getOrd(doc);
+    if (!ordSet.exists(key)) {
+      ordSet.put(key);
+      BytesRef term = key == 0 ? null : index.getTerm(doc, new BytesRef());
+      groups.add(term);
+    }
+  }
+
+  /**
+   * Returns the total number of groups for the executed search.
+   * This is a convenience method. The following code snippet has the same effect: <pre>getGroups().size()</pre>
+   *
+   * @return The total number of groups for the executed search
+   */
+  public int getGroupCount() {
+    return groups.size();
+  }
+
+  /**
+   * Returns the group values
+   * <p/>
+   * This is an unordered collections of group values. For each group that matched the query there is a {@link BytesRef}
+   * representing a group value.
+   *
+   * @return the group values
+   */
+  public Collection<BytesRef> getGroups() {
+    return groups;
+  }
+
+  public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
+    index = FieldCache.DEFAULT.getTermsIndex(context.reader, groupField);
+
+    // Clear ordSet and fill it with previous encountered groups that can occur in the current segment.
+    ordSet.clear();
+    for (BytesRef countedGroup : groups) {
+      int ord = index.binarySearchLookup(countedGroup, spareBytesRef);
+      if (ord >= 0) {
+        ordSet.put(ord);
+      }
+    }
+  }
+
+  public boolean acceptsDocsOutOfOrder() {
+    return true;
+  }
+}
\ No newline at end of file

Modified: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (original)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java Tue May 17 17:20:54 2011
@@ -21,9 +21,6 @@ import org.apache.lucene.search.SortFiel
 
 /** Represents result returned by a grouping search.
  *
- * Note that we do not return the total number of unique
- * groups; doing so would be costly.
- * 
  * @lucene.experimental */
 public class TopGroups {
   /** Number of documents matching the search */
@@ -32,6 +29,9 @@ public class TopGroups {
   /** Number of documents grouped into the topN groups */
   public final int totalGroupedHitCount;
 
+  /** The total number of unique groups. If <code>null</code> this value is not computed. */
+  public final Integer totalGroupCount;
+
   /** Group results in groupSort order */
   public final GroupDocs[] groups;
 
@@ -47,5 +47,15 @@ public class TopGroups {
     this.totalHitCount = totalHitCount;
     this.totalGroupedHitCount = totalGroupedHitCount;
     this.groups = groups;
+    this.totalGroupCount = null;
+  }
+
+  public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
+    this.groupSort = oldTopGroups.groupSort;
+    this.withinGroupSort = oldTopGroups.withinGroupSort;
+    this.totalHitCount = oldTopGroups.totalHitCount;
+    this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
+    this.groups = oldTopGroups.groups;
+    this.totalGroupCount = totalGroupCount;
   }
 }

Modified: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html (original)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html Tue May 17 17:20:54 2011
@@ -88,6 +88,13 @@ field fall into a single group.</p>
   boolean fillFields = true;
   SecondPassGroupingCollector c2 = new SecondPassGroupingCollector("author", topGroups, groupSort, docSort, docOffset+docsPerGroup, getScores, getMaxScores, fillFields);
 
+  //Optionally compute total group count
+  AllGroupsCollector allGroupsCollector = null;
+  if (requiredTotalGroupCount) {
+    allGroupsCollector = new AllGroupsCollector("author");
+    c2 = MultiCollector.wrap(c2, allGroupsCollector);
+  }
+
   if (cachedCollector.isCached()) {
     // Cache fit within maxCacheRAMMB, so we can replay it:
     cachedCollector.replay(c2);
@@ -97,6 +104,9 @@ field fall into a single group.</p>
   }
         
   TopGroups groupsResult = c2.getTopGroups(docOffset);
+  if (requiredTotalGroupCount) {
+    groupResult = new TopGroups(groupsResult, allGroupsCollector.getGroupCount());
+  }
 
   // Render groupsResult...
 </pre>

Added: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java?rev=1104421&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java (added)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AllGroupsCollectorTest.java Tue May 17 17:20:54 2011
@@ -0,0 +1,109 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class AllGroupsCollectorTest extends LuceneTestCase {
+
+  public void testTotalGroupCount() throws Exception {
+
+    final String groupField = "author";
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(
+                               random,
+                               dir,
+                               newIndexWriterConfig(TEST_VERSION_CURRENT,
+                                                    new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+    // 0
+    Document doc = new Document();
+    doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "random text", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    // 1
+    doc = new Document();
+    doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "some more random text blob", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    // 2
+    doc = new Document();
+    doc.add(new Field(groupField, "author1", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "some more random textual data", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+    w.commit(); // To ensure a second segment
+
+    // 3
+    doc = new Document();
+    doc.add(new Field(groupField, "author2", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "some random text", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    // 4
+    doc = new Document();
+    doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "some more random text", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    // 5
+    doc = new Document();
+    doc.add(new Field(groupField, "author3", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("content", "random blob", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    // 6 -- no author field
+    doc = new Document();
+    doc.add(new Field("content", "random word stuck in alot of other text", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "6", Field.Store.YES, Field.Index.NO));
+    w.addDocument(doc);
+
+    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+    w.close();
+
+    AllGroupsCollector c1 = new AllGroupsCollector(groupField);
+    indexSearcher.search(new TermQuery(new Term("content", "random")), c1);
+    assertEquals(4, c1.getGroupCount());
+
+    AllGroupsCollector c2 = new AllGroupsCollector(groupField);
+    indexSearcher.search(new TermQuery(new Term("content", "some")), c2);
+    assertEquals(3, c2.getGroupCount());
+
+    AllGroupsCollector c3 = new AllGroupsCollector(groupField);
+    indexSearcher.search(new TermQuery(new Term("content", "blob")), c3);
+    assertEquals(2, c3.getGroupCount());
+
+    indexSearcher.getIndexReader().close();
+    dir.close();
+  }
+}

Modified: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1104421&r1=1104420&r2=1104421&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Tue May 17 17:20:54 2011
@@ -17,13 +17,7 @@
 
 package org.apache.lucene.search.grouping;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -32,15 +26,7 @@ import org.apache.lucene.document.Numeri
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.CachingCollector;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -121,7 +107,7 @@ public class TestGrouping extends Lucene
 
     final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
     indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
-    
+
     final TopGroups groups = c2.getTopGroups(0);
 
     assertEquals(7, groups.totalHitCount);
@@ -243,6 +229,7 @@ public class TestGrouping extends Lucene
                                  boolean fillFields,
                                  boolean getScores,
                                  boolean getMaxScores,
+                                 boolean doAllGroups,
                                  Sort groupSort,
                                  Sort docSort,
                                  int topNGroups,
@@ -258,6 +245,7 @@ public class TestGrouping extends Lucene
     final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
 
     int totalHitCount = 0;
+    Set<BytesRef> knownGroups = new HashSet<BytesRef>();
 
     for(GroupDoc d : groupDocs) {
       // TODO: would be better to filter by searchTerm before sorting!
@@ -265,6 +253,13 @@ public class TestGrouping extends Lucene
         continue;
       }
       totalHitCount++;
+
+      if (doAllGroups) {
+        if (!knownGroups.contains(d.group)) {
+          knownGroups.add(d.group);
+        }
+      }
+
       List<GroupDoc> l = groups.get(d.group);
       if (l == null) {
         sortedGroups.add(d.group);
@@ -317,7 +312,14 @@ public class TestGrouping extends Lucene
                                               fillFields ? sortedGroupFields.get(idx) : null);
     }
 
-    return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+    if (doAllGroups) {
+      return new TopGroups(
+          new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
+          knownGroups.size()
+      );
+    } else {
+      return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+    }
   }
 
   public void testRandom() throws Exception {
@@ -335,7 +337,7 @@ public class TestGrouping extends Lucene
       if (VERBOSE) {
         System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
       }
-      
+
       final List<BytesRef> groups = new ArrayList<BytesRef>();
       for(int i=0;i<numGroups;i++) {
         groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@@ -428,8 +430,16 @@ public class TestGrouping extends Lucene
         //final int docOffset = 0;
 
         final boolean doCache = random.nextBoolean();
+        final boolean doAllGroups = random.nextBoolean();
         if (VERBOSE) {
-          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
+          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+        }
+
+        final AllGroupsCollector allGroupsCollector;
+        if (doAllGroups) {
+          allGroupsCollector = new AllGroupsCollector("group");
+        } else {
+          allGroupsCollector = null;
         }
 
         final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
@@ -440,7 +450,16 @@ public class TestGrouping extends Lucene
           if (VERBOSE) {
             System.out.println("TEST: maxCacheMB=" + maxCacheMB);
           }
-          c = cCache = new CachingCollector(c1, true, maxCacheMB);
+
+          if (doAllGroups) {
+            cCache = new CachingCollector(c1, true, maxCacheMB);
+            c = MultiCollector.wrap(cCache, allGroupsCollector);
+          } else {
+            c = cCache = new CachingCollector(c1, true, maxCacheMB);
+          }
+        } else if (doAllGroups) {
+          c = MultiCollector.wrap(c1, allGroupsCollector);
+          cCache = null;
         } else {
           c = c1;
           cCache = null;
@@ -475,8 +494,13 @@ public class TestGrouping extends Lucene
           } else {
             s.search(new TermQuery(new Term("content", searchTerm)), c2);
           }
-        
-          groupsResult = c2.getTopGroups(docOffset);
+
+          if (doAllGroups) {
+            TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+            groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
+          } else {
+            groupsResult = c2.getTopGroups(docOffset);
+          }
         } else {
           groupsResult = null;
           if (VERBOSE) {
@@ -484,7 +508,7 @@ public class TestGrouping extends Lucene
           }
         }
 
-        final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+        final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
 
         try {
           // NOTE: intentional but temporary field cache insanity!
@@ -509,7 +533,10 @@ public class TestGrouping extends Lucene
     assertEquals(expected.groups.length, actual.groups.length);
     assertEquals(expected.totalHitCount, actual.totalHitCount);
     assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
-    
+    if (expected.totalGroupCount != null) {
+      assertEquals(expected.totalGroupCount, actual.totalGroupCount);
+    }
+
     for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
       if (VERBOSE) {
         System.out.println("  check groupIDX=" + groupIDX);