You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mv...@apache.org on 2012/03/07 22:57:48 UTC

svn commit: r1298144 - in /lucene/dev/trunk: lucene/contrib/ lucene/core/src/java/org/apache/lucene/index/ modules/grouping/src/java/org/apache/lucene/search/grouping/ modules/grouping/src/java/org/apache/lucene/search/grouping/term/ modules/grouping/s...

Author: mvg
Date: Wed Mar  7 21:57:48 2012
New Revision: 1298144

URL: http://svn.apache.org/viewvc?rev=1298144&view=rev
Log:
LUCENE-3802: Support for grouped faceting.

Added:
    lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractGroupFacetCollector.java
    lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermGroupFacetCollector.java
    lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AbstractGroupingTestCase.java
    lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TermGroupFacetCollectorTest.java
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1298144&r1=1298143&r2=1298144&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed Mar  7 21:57:48 2012
@@ -72,6 +72,8 @@ New Features
    start/endOffset, if offsets are indexed. (Alan Woodward via Mike
    McCandless)
 
+ * LUCENE-3802: Support for grouped faceting. (Martijn van Groningen)
+
 API Changes
 
  * LUCENE-2606: Changed RegexCapabilities interface to fix thread 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java?rev=1298144&r1=1298143&r2=1298144&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/DocTermOrds.java Wed Mar  7 21:57:48 2012
@@ -216,6 +216,13 @@ public class DocTermOrds {
     }
   }
 
+  /**
+   * @return The number of terms in this field
+   */
+  public int numTerms() {
+    return numTermsInField;
+  }
+
   /** Subclass can override this */
   protected void visitTerm(TermsEnum te, int termNum) throws IOException {
   }

Added: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractGroupFacetCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractGroupFacetCollector.java?rev=1298144&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractGroupFacetCollector.java (added)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/AbstractGroupFacetCollector.java Wed Mar  7 21:57:48 2012
@@ -0,0 +1,224 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * Base class for computing grouped facets.
+ *
+ * @lucene.experimental
+ */
+public abstract class AbstractGroupFacetCollector extends Collector {
+
+  protected final String groupField;
+  protected final String facetField;
+  protected final BytesRef facetPrefix;
+
+  protected AbstractGroupFacetCollector(String groupField, String facetField, BytesRef facetPrefix) {
+    this.groupField = groupField;
+    this.facetField = facetField;
+    this.facetPrefix = facetPrefix;
+  }
+
+  /**
+   * Returns grouped facet results that were computed over zero or more segments.
+   * Grouped facet counts are merged from zero or more segment results.
+   *
+   * @param size The total number of facets to include. This is typically offset + limit
+   * @param minCount The minimum count a facet entry should have to be included in the grouped facet result
+   * @param orderByCount Whether to sort the facet entries by facet entry count. If <code>false</code> then the facets
+   *                     are sorted lexicographically in ascending order.
+   * @return grouped facet results
+   * @throws IOException If I/O related errors occur during merging segment grouped facet counts.
+   */
+  public abstract GroupedFacetResult mergeSegmentResults(int size, int minCount, boolean orderByCount) throws IOException;
+
+  public void setScorer(Scorer scorer) throws IOException {
+  }
+
+  public boolean acceptsDocsOutOfOrder() {
+    return true;
+  }
+
+  /**
+   * The grouped facet result, containing the grouped facet entries, the total count and the total missing count.
+   */
+  public static class GroupedFacetResult {
+
+    private final static Comparator<FacetEntry> orderByCountAndValue = new Comparator<FacetEntry>() {
+
+      public int compare(FacetEntry a, FacetEntry b) {
+        int cmp = b.count - a.count; // Highest count first!
+        if (cmp != 0) {
+          return cmp;
+        }
+        return a.value.compareTo(b.value);
+      }
+
+    };
+
+    private final static Comparator<FacetEntry> orderByValue = new Comparator<FacetEntry>() {
+
+      public int compare(FacetEntry a, FacetEntry b) {
+        return a.value.compareTo(b.value);
+      }
+
+    };
+
+    private final int maxSize;
+    private final NavigableSet<FacetEntry> facetEntries;
+    private final int totalMissingCount;
+    private final int totalCount;
+
+    private int currentMin;
+
+    public GroupedFacetResult(int size, int minCount, boolean orderByCount, int totalCount, int totalMissingCount) {
+      this.facetEntries = new TreeSet<FacetEntry>(orderByCount ? orderByCountAndValue : orderByValue);
+      this.totalMissingCount = totalMissingCount;
+      this.totalCount = totalCount;
+      maxSize = size;
+      currentMin = minCount;
+    }
+
+    public void addFacetCount(BytesRef facetValue, int count) {
+      if (count < currentMin) {
+        return;
+      }
+
+      FacetEntry facetEntry = new FacetEntry(facetValue, count);
+      if (facetEntries.size() == maxSize) {
+        if (facetEntries.higher(facetEntry) == null) {
+          return;
+        }
+        facetEntries.pollLast();
+      }
+      facetEntries.add(facetEntry);
+
+      if (facetEntries.size() == maxSize) {
+        currentMin = facetEntries.last().count;
+      }
+    }
+
+    /**
+     * Returns a list of facet entries to be rendered based on the specified offset and limit.
+     * The facet entries are retrieved from the facet entries collected during merging.
+     *
+     * @param offset The offset in the collected facet entries during merging
+     * @param limit The number of facets to return starting from the offset.
+     * @return a list of facet entries to be rendered based on the specified offset and limit
+     */
+    public List<FacetEntry> getFacetEntries(int offset, int limit) {
+      List<FacetEntry> entries = new LinkedList<FacetEntry>();
+      limit += offset;
+
+      int i = 0;
+      for (FacetEntry facetEntry : facetEntries) {
+        if (i < offset) {
+          i++;
+          continue;
+        }
+        if (i++ >= limit) {
+          break;
+        }
+        entries.add(facetEntry);
+      }
+      return entries;
+    }
+
+    /**
+     * Returns the sum of all facet entry counts.
+     *
+     * @return the sum of all facet entry counts
+     */
+    public int getTotalCount() {
+      return totalCount;
+    }
+
+    /**
+     * Returns the number of groups that didn't have a facet value.
+     *
+     * @return the number of groups that didn't have a facet value
+     */
+    public int getTotalMissingCount() {
+      return totalMissingCount;
+    }
+  }
+
+  /**
+   * Represents a facet entry with a value and a count.
+   */
+  public static class FacetEntry {
+
+    private final BytesRef value;
+    private final int count;
+
+    public FacetEntry(BytesRef value, int count) {
+      this.value = value;
+      this.count = count;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+
+      FacetEntry that = (FacetEntry) o;
+
+      if (count != that.count) return false;
+      if (!value.equals(that.value)) return false;
+
+      return true;
+    }
+
+    @Override
+    public int hashCode() {
+      int result = value.hashCode();
+      result = 31 * result + count;
+      return result;
+    }
+
+    @Override
+    public String toString() {
+      return "FacetEntry{" +
+          "value=" + value.utf8ToString() +
+          ", count=" + count +
+          '}';
+    }
+
+    /**
+     * @return The value of this facet entry
+     */
+    public BytesRef getValue() {
+      return value;
+    }
+
+    /**
+     * @return The count (number of groups) of this facet entry.
+     */
+    public int getCount() {
+      return count;
+    }
+  }
+
+}

Added: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermGroupFacetCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermGroupFacetCollector.java?rev=1298144&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermGroupFacetCollector.java (added)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/term/TermGroupFacetCollector.java Wed Mar  7 21:57:48 2012
@@ -0,0 +1,391 @@
+package org.apache.lucene.search.grouping.term;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocTermOrds;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.grouping.AbstractGroupFacetCollector;
+import org.apache.lucene.util.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An implementation of {@link AbstractGroupFacetCollector} that computes grouped facets based on the indexed terms
+ * from the {@link FieldCache}.
+ *
+ * @lucene.experimental
+ */
+public abstract class TermGroupFacetCollector extends AbstractGroupFacetCollector {
+
+  final List<GroupedFacetHit> groupedFacetHits;
+  final SentinelIntSet segmentGroupedFacetHits;
+  final List<SegmentResult> segmentResults;
+  final BytesRef spare = new BytesRef();
+
+  FieldCache.DocTermsIndex groupFieldTermsIndex;
+  int[] segmentFacetCounts;
+  int segmentTotalCount;
+  int startFacetOrd;
+  int endFacetOrd;
+
+  /**
+   * Factory method for creating the right implementation based on whether the facet field contains
+   * multiple tokens per document.
+   *
+   * @param groupField The group field
+   * @param facetField The facet field
+   * @param facetFieldMultivalued Whether the facet field has multiple tokens per document
+   * @param facetPrefix The facet prefix a facet entry should start with to be included.
+   * @param initialSize The initial allocation size of the internal int set and group facet list which should roughly
+   *                    match the total number of expected unique groups. Be aware that the heap usage is
+   *                    4 bytes * initialSize.
+   * @return <code>TermGroupFacetCollector</code> implementation
+   */
+  public static TermGroupFacetCollector createTermGroupFacetCollector(String groupField,
+                                                                      String facetField,
+                                                                      boolean facetFieldMultivalued,
+                                                                      BytesRef facetPrefix,
+                                                                      int initialSize) {
+    if (facetFieldMultivalued) {
+      return new MV(groupField, facetField, facetPrefix, initialSize);
+    } else {
+      return new SV(groupField, facetField, facetPrefix, initialSize);
+    }
+  }
+
+  TermGroupFacetCollector(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
+    super(groupField, facetField, facetPrefix);
+    groupedFacetHits = new ArrayList<GroupedFacetHit>(initialSize);
+    segmentGroupedFacetHits = new SentinelIntSet(initialSize, -1);
+    segmentResults = new ArrayList<SegmentResult>();
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public GroupedFacetResult mergeSegmentResults(int size, int minCount, boolean orderByCount) throws IOException {
+    if (segmentFacetCounts != null) {
+      segmentResults.add(createSegmentResult());
+      segmentFacetCounts = null; // reset
+    }
+
+    int totalCount = 0;
+    int missingCount = 0;
+    SegmentResultPriorityQueue segments = new SegmentResultPriorityQueue(segmentResults.size());
+    for (SegmentResult segmentResult : segmentResults) {
+      missingCount += segmentResult.missing;
+      if (segmentResult.mergePos >= segmentResult.maxTermPos) {
+        continue;
+      }
+      totalCount += segmentResult.total;
+      segmentResult.initializeForMerge();
+      segments.add(segmentResult);
+    }
+
+    GroupedFacetResult facetResult = new GroupedFacetResult(size, minCount, orderByCount, totalCount, missingCount);
+    while (segments.size() > 0) {
+      SegmentResult segmentResult = segments.top();
+      BytesRef currentFacetValue = BytesRef.deepCopyOf(segmentResult.mergeTerm);
+      int count = 0;
+
+      do {
+        count += segmentResult.counts[segmentResult.mergePos++];
+        if (segmentResult.mergePos < segmentResult.maxTermPos) {
+          segmentResult.nextTerm();
+          segmentResult = segments.updateTop();
+        } else {
+          segments.pop();
+          segmentResult = segments.top();
+          if (segmentResult == null) {
+            break;
+          }
+        }
+      } while (currentFacetValue.equals(segmentResult.mergeTerm));
+      facetResult.addFacetCount(currentFacetValue, count);
+    }
+    return facetResult;
+  }
+
+  protected abstract SegmentResult createSegmentResult();
+
+  // Implementation for single valued facet fields.
+  static class SV extends TermGroupFacetCollector {
+
+    private FieldCache.DocTermsIndex facetFieldTermsIndex;
+
+    SV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
+      super(groupField, facetField, facetPrefix, initialSize);
+    }
+
+    public void collect(int doc) throws IOException {
+      int facetOrd = facetFieldTermsIndex.getOrd(doc);
+      if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
+        return;
+      }
+
+      int groupOrd = groupFieldTermsIndex.getOrd(doc);
+      int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
+      if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
+        return;
+      }
+
+      segmentTotalCount++;
+      segmentFacetCounts[facetOrd]++;
+
+      segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
+      groupedFacetHits.add(
+          new GroupedFacetHit(
+              groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
+              facetOrd == 0 ? null : facetFieldTermsIndex.lookup(facetOrd, new BytesRef())
+          )
+      );
+    }
+
+    public void setNextReader(AtomicReaderContext context) throws IOException {
+      if (segmentFacetCounts != null) {
+        segmentResults.add(createSegmentResult());
+      }
+
+      groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
+      facetFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), facetField);
+      segmentFacetCounts = new int[facetFieldTermsIndex.numOrd()];
+      segmentTotalCount = 0;
+
+      segmentGroupedFacetHits.clear();
+      for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
+        int facetOrd = facetFieldTermsIndex.binarySearchLookup(groupedFacetHit.facetValue, spare);
+        if (facetOrd < 0) {
+          continue;
+        }
+
+        int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
+        if (groupOrd < 0) {
+          continue;
+        }
+
+        int segmentGroupedFacetsIndex = (groupOrd * facetFieldTermsIndex.numOrd()) + facetOrd;
+        segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
+      }
+
+      if (facetPrefix != null) {
+        startFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetPrefix, spare);
+        if (startFacetOrd < 0) {
+          // Points to the ord one higher than facetPrefix
+          startFacetOrd = -startFacetOrd - 1;
+        }
+        BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
+        facetEndPrefix.append(UnicodeUtil.BIG_TERM);
+        endFacetOrd = facetFieldTermsIndex.binarySearchLookup(facetEndPrefix, spare);
+        endFacetOrd = -endFacetOrd - 1; // Points to the ord one higher than facetEndPrefix
+      } else {
+        startFacetOrd = 0;
+        endFacetOrd = facetFieldTermsIndex.numOrd();
+      }
+    }
+
+    protected SegmentResult createSegmentResult() {
+      return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldTermsIndex.getTermsEnum(), startFacetOrd, endFacetOrd);
+    }
+  }
+
+  // Implementation for multi valued facet fields.
+  static class MV extends TermGroupFacetCollector {
+
+    private DocTermOrds facetFieldDocTermOrds;
+    private TermsEnum facetOrdTermsEnum;
+    private DocTermOrds.TermOrdsIterator reuse;
+
+    MV(String groupField, String facetField, BytesRef facetPrefix, int initialSize) {
+      super(groupField, facetField, facetPrefix, initialSize);
+    }
+
+    public void collect(int doc) throws IOException {
+      int groupOrd = groupFieldTermsIndex.getOrd(doc);
+      reuse = facetFieldDocTermOrds.lookup(doc, reuse);
+      int chunk;
+      boolean first = true;
+      int[] buffer = new int[5];
+      do {
+        chunk = reuse.read(buffer);
+        if (first && chunk == 0) {
+          chunk = 1;
+          buffer[0] = facetFieldDocTermOrds.numTerms(); // this facet ord is reserved for docs not containing facet field.
+        }
+        first = false;
+
+        for (int pos = 0; pos < chunk; pos++) {
+          int facetOrd = buffer[pos];
+          if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
+            continue;
+          }
+
+          int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
+          if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
+            continue;
+          }
+
+          segmentTotalCount++;
+          segmentFacetCounts[facetOrd]++;
+
+          segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
+          groupedFacetHits.add(
+              new GroupedFacetHit(
+                  groupOrd == 0 ? null : groupFieldTermsIndex.lookup(groupOrd, new BytesRef()),
+                  facetOrd == facetFieldDocTermOrds.numTerms() ? null : BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupTerm(facetOrdTermsEnum, facetOrd))
+              )
+          );
+        }
+      } while (chunk >= buffer.length);
+    }
+
+    public void setNextReader(AtomicReaderContext context) throws IOException {
+      if (segmentFacetCounts != null) {
+        segmentResults.add(createSegmentResult());
+      }
+
+      reuse = null;
+      groupFieldTermsIndex = FieldCache.DEFAULT.getTermsIndex(context.reader(), groupField);
+      facetFieldDocTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), facetField);
+      facetOrdTermsEnum = facetFieldDocTermOrds.getOrdTermsEnum(context.reader());
+      // [facetFieldDocTermOrds.numTerms() + 1] for all possible facet values and docs not containing facet field
+      segmentFacetCounts = new int[facetFieldDocTermOrds.numTerms() + 1];
+      segmentTotalCount = 0;
+
+      segmentGroupedFacetHits.clear();
+      for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
+        int groupOrd = groupFieldTermsIndex.binarySearchLookup(groupedFacetHit.groupValue, spare);
+        if (groupOrd < 0) {
+          continue;
+        }
+
+        int facetOrd;
+        if (groupedFacetHit.facetValue != null) {
+          if (!facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue, true)) {
+            continue;
+          }
+          facetOrd = (int) facetOrdTermsEnum.ord();
+        } else {
+          facetOrd = facetFieldDocTermOrds.numTerms();
+        }
+
+        // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
+        int segmentGroupedFacetsIndex = (groupOrd * (facetFieldDocTermOrds.numTerms() + 1)) + facetOrd;
+        segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
+      }
+
+      if (facetPrefix != null) {
+        TermsEnum.SeekStatus seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix, true);
+        if (seekStatus != TermsEnum.SeekStatus.END) {
+          startFacetOrd = (int) facetOrdTermsEnum.ord();
+        } else {
+          startFacetOrd = 0;
+          endFacetOrd = 0;
+          return;
+        }
+
+        BytesRef facetEndPrefix = BytesRef.deepCopyOf(facetPrefix);
+        facetEndPrefix.append(UnicodeUtil.BIG_TERM);
+        seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix, true);
+        if (seekStatus != TermsEnum.SeekStatus.END) {
+          endFacetOrd = (int) facetOrdTermsEnum.ord();
+        } else {
+          endFacetOrd = facetFieldDocTermOrds.numTerms(); // Don't include null...
+        }
+      } else {
+        startFacetOrd = 0;
+        endFacetOrd = facetFieldDocTermOrds.numTerms() + 1;
+      }
+    }
+
+    protected SegmentResult createSegmentResult() {
+      return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldDocTermOrds.numTerms(), facetOrdTermsEnum, startFacetOrd, endFacetOrd);
+    }
+  }
+
+}
+
+class SegmentResult {
+
+  final int[] counts;
+  final int total;
+  final int missing;
+
+  // Used for merging the segment results
+  BytesRef mergeTerm;
+  int mergePos;
+  final int maxTermPos;
+  final TermsEnum tenum;
+
+  SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) {
+    this.counts = counts;
+    this.missing = counts[0];
+    this.total = total - missing;
+    this.tenum = tenum;
+    this.mergePos = startFacetOrd == 0 ? 1 : startFacetOrd;
+    this.maxTermPos = endFacetOrd;
+  }
+
+  SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd) {
+    this.counts = counts;
+    this.missing = counts[missingCountIndex];
+    this.total = total - missing;
+    this.tenum = tenum;
+    this.mergePos = startFacetOrd;
+    if (endFacetOrd == missingCountIndex + 1) {
+      this.maxTermPos = missingCountIndex;
+    } else {
+      this.maxTermPos = endFacetOrd;
+    }
+  }
+
+  void initializeForMerge() throws IOException {
+    tenum.seekExact(mergePos);
+    mergeTerm = tenum.term();
+  }
+
+  void nextTerm() throws IOException {
+    mergeTerm = tenum.next();
+  }
+
+}
+
+class GroupedFacetHit {
+
+  final BytesRef groupValue;
+  final BytesRef facetValue;
+
+  GroupedFacetHit(BytesRef groupValue, BytesRef facetValue) {
+    this.groupValue = groupValue;
+    this.facetValue = facetValue;
+  }
+}
+
+class SegmentResultPriorityQueue extends PriorityQueue<SegmentResult> {
+
+  SegmentResultPriorityQueue(int maxSize) {
+    super(maxSize);
+  }
+
+  protected boolean lessThan(SegmentResult a, SegmentResult b) {
+    return a.mergeTerm.compareTo(b.mergeTerm) < 0;
+  }
+}

Added: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AbstractGroupingTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AbstractGroupingTestCase.java?rev=1298144&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AbstractGroupingTestCase.java (added)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/AbstractGroupingTestCase.java Wed Mar  7 21:57:48 2012
@@ -0,0 +1,52 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Random;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+/**
+ * Base class for grouping related tests.
+ */
+// TODO (MvG) : The grouping tests contain a lot of code duplication. Try to move the common code to this class.
+public class AbstractGroupingTestCase extends LuceneTestCase {
+  
+  protected String generateRandomNonEmptyString() {
+    String randomValue;
+    do {
+      // Because of the DV based impl we can't see the difference between an empty string and a null value.
+      // For that reason we don't generate empty string groups.
+      randomValue = _TestUtil.randomRealisticUnicodeString(random);
+    } while ("".equals(randomValue));
+    return randomValue;
+  }
+
+}

Added: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TermGroupFacetCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TermGroupFacetCollectorTest.java?rev=1298144&view=auto
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TermGroupFacetCollectorTest.java (added)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TermGroupFacetCollectorTest.java Wed Mar  7 21:57:48 2012
@@ -0,0 +1,600 @@
+package org.apache.lucene.search.grouping;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.grouping.term.TermGroupFacetCollector;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util._TestUtil;
+
+import java.io.IOException;
+import java.util.*;
+
+public class TermGroupFacetCollectorTest extends AbstractGroupingTestCase {
+
+  public void testSimple() throws Exception {
+    final String groupField = "hotel";
+    FieldType customType = new FieldType();
+    customType.setStored(true);
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(
+        random,
+        dir,
+        newIndexWriterConfig(TEST_VERSION_CURRENT,
+            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+    boolean canUseIDV = false;// Enable later... !"Lucene3x".equals(w.w.getConfig().getCodec().getName());
+
+    // 0
+    Document doc = new Document();
+    addGroupField(doc, groupField, "a", canUseIDV);
+    doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 1
+    doc = new Document();
+    addGroupField(doc, groupField, "a", canUseIDV);
+    doc.add(new Field("airport", "dus", TextField.TYPE_STORED));
+    doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 2
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+    w.commit(); // To ensure a second segment
+
+    // 3
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 4
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("airport", "ams", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+    TermGroupFacetCollector groupedAirportFacetCollector =
+        TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", false, null, 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
+    TermGroupFacetCollector.GroupedFacetResult airportResult = groupedAirportFacetCollector.mergeSegmentResults(10, 0, false);
+    assertEquals(3, airportResult.getTotalCount());
+    assertEquals(0, airportResult.getTotalMissingCount());
+
+    List<TermGroupFacetCollector.FacetEntry> entries = airportResult.getFacetEntries(0, 10);
+    assertEquals(2, entries.size());
+    assertEquals("ams", entries.get(0).getValue().utf8ToString());
+    assertEquals(2, entries.get(0).getCount());
+    assertEquals("dus", entries.get(1).getValue().utf8ToString());
+    assertEquals(1, entries.get(1).getCount());
+
+
+    TermGroupFacetCollector groupedDurationFacetCollector =
+        TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, null, 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
+    TermGroupFacetCollector.GroupedFacetResult durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 0, false);
+    assertEquals(4, durationResult.getTotalCount());
+    assertEquals(0, durationResult.getTotalMissingCount());
+
+    entries = durationResult.getFacetEntries(0, 10);
+    assertEquals(2, entries.size());
+    assertEquals("10", entries.get(0).getValue().utf8ToString());
+    assertEquals(2, entries.get(0).getCount());
+    assertEquals("5", entries.get(1).getValue().utf8ToString());
+    assertEquals(2, entries.get(1).getCount());
+
+    // 5
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("duration", "5", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 6
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 7
+    doc = new Document();
+    addGroupField(doc, groupField, "b", canUseIDV);
+    doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "15", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 8
+    doc = new Document();
+    addGroupField(doc, groupField, "a", canUseIDV);
+    doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    indexSearcher.getIndexReader().close();
+    indexSearcher = new IndexSearcher(w.getReader());
+    groupedAirportFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", true, null, 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
+    airportResult = groupedAirportFacetCollector.mergeSegmentResults(3, 0, true);
+    assertEquals(5, airportResult.getTotalCount());
+    assertEquals(1, airportResult.getTotalMissingCount());
+
+    entries = airportResult.getFacetEntries(1, 2);
+    assertEquals(2, entries.size());
+    assertEquals("bru", entries.get(0).getValue().utf8ToString());
+    assertEquals(2, entries.get(0).getCount());
+    assertEquals("dus", entries.get(1).getValue().utf8ToString());
+    assertEquals(1, entries.get(1).getCount());
+
+    groupedDurationFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, null, 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
+    durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 2, true);
+    assertEquals(5, durationResult.getTotalCount());
+    assertEquals(0, durationResult.getTotalMissingCount());
+
+    entries = durationResult.getFacetEntries(1, 1);
+    assertEquals(1, entries.size());
+    assertEquals("5", entries.get(0).getValue().utf8ToString());
+    assertEquals(2, entries.get(0).getCount());
+
+    // 9
+    doc = new Document();
+    addGroupField(doc, groupField, "c", canUseIDV);
+    doc.add(new Field("airport", "bru", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "15", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    // 10
+    doc = new Document();
+    addGroupField(doc, groupField, "c", canUseIDV);
+    doc.add(new Field("airport", "dus", TextField.TYPE_UNSTORED));
+    doc.add(new Field("duration", "10", TextField.TYPE_UNSTORED));
+    w.addDocument(doc);
+
+    indexSearcher.getIndexReader().close();
+    indexSearcher = new IndexSearcher(w.getReader());
+    groupedAirportFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "airport", false, null, 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedAirportFacetCollector);
+    airportResult = groupedAirportFacetCollector.mergeSegmentResults(10, 0, false);
+    assertEquals(7, airportResult.getTotalCount());
+    assertEquals(1, airportResult.getTotalMissingCount());
+
+    entries = airportResult.getFacetEntries(0, 10);
+    assertEquals(3, entries.size());
+    assertEquals("ams", entries.get(0).getValue().utf8ToString());
+    assertEquals(2, entries.get(0).getCount());
+    assertEquals("bru", entries.get(1).getValue().utf8ToString());
+    assertEquals(3, entries.get(1).getCount());
+    assertEquals("dus", entries.get(2).getValue().utf8ToString());
+    assertEquals(2, entries.get(2).getCount());
+
+    groupedDurationFacetCollector = TermGroupFacetCollector.createTermGroupFacetCollector(groupField, "duration", false, new BytesRef("1"), 128);
+    indexSearcher.search(new MatchAllDocsQuery(), groupedDurationFacetCollector);
+    durationResult = groupedDurationFacetCollector.mergeSegmentResults(10, 0, true);
+    assertEquals(5, durationResult.getTotalCount());
+    assertEquals(0, durationResult.getTotalMissingCount());
+
+    entries = durationResult.getFacetEntries(0, 10);
+    assertEquals(2, entries.size());
+    assertEquals("10", entries.get(0).getValue().utf8ToString());
+    assertEquals(3, entries.get(0).getCount());
+    assertEquals("15", entries.get(1).getValue().utf8ToString());
+    assertEquals(2, entries.get(1).getCount());
+
+    w.close();
+    indexSearcher.getIndexReader().close();
+    dir.close();
+  }
+
+  /**
+   * Adds the group value to the document as an unstored text field and, when
+   * {@code canUseIDV} is set, additionally as a sorted variable-length bytes doc values field
+   * under the same name.
+   */
+  private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) {
+    Field indexedGroup = new Field(groupField, value, TextField.TYPE_UNSTORED);
+    doc.add(indexedGroup);
+    if (!canUseIDV) {
+      return;
+    }
+    BytesRef groupBytes = new BytesRef(value);
+    doc.add(new DocValuesField(groupField, groupBytes, DocValues.Type.BYTES_VAR_SORTED));
+  }
+
+  /**
+   * Randomized comparison of the grouped facet collector against an expected result that is
+   * computed from the bookkeeping gathered while indexing.
+   * <p>
+   * Between 3 and 6 random indices are built. Against each index 100 searches are run with a
+   * random search term, offset, limit, minimum count, ordering and facet prefix; the total
+   * count, the missing count and every returned facet entry must match the expectation.
+   */
+  public void testRandom() throws Exception {
+    int numberOfRuns = _TestUtil.nextInt(random, 3, 6);
+    for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
+      boolean multipleFacetsPerDocument = random.nextBoolean();
+      IndexContext context = createIndexContext(multipleFacetsPerDocument);
+      final IndexSearcher searcher = newSearcher(context.indexReader);
+
+      for (int searchIter = 0; searchIter < 100; searchIter++) {
+        String searchTerm = context.contentStrings[random.nextInt(context.contentStrings.length)];
+        int limit = random.nextInt(context.facetValues.size());
+        int offset = random.nextInt(context.facetValues.size() - limit);
+        int size = offset + limit;
+        int minCount = random.nextBoolean() ? 0 : random.nextInt(1 + context.facetWithMostGroups / 10);
+        boolean orderByCount = random.nextBoolean();
+        String randomStr = getFromSet(context.facetValues, random.nextInt(context.facetValues.size()));
+        final String facetPrefix;
+        if (randomStr == null) {
+          // The set of facet values can contain null (documents indexed without a facet value).
+          facetPrefix = null;
+        } else {
+          // Use the leading 1..codePointLen code points of an existing facet value, so the prefix
+          // is a real prefix of at least one value, is never empty and never splits a surrogate pair.
+          int codePointLen = randomStr.codePointCount(0, randomStr.length());
+          int prefixLen = 1 + random.nextInt(codePointLen);
+          int end = randomStr.offsetByCodePoints(0, prefixLen);
+          facetPrefix = random.nextBoolean() ? null : randomStr.substring(0, end);
+        }
+
+        GroupedFacetResult expectedFacetResult = createExpectedFacetResult(searchTerm, context, offset, limit, minCount, orderByCount, facetPrefix);
+        TermGroupFacetCollector groupFacetCollector = createRandomCollector("group", "facet", facetPrefix, multipleFacetsPerDocument);
+        searcher.search(new TermQuery(new Term("content", searchTerm)), groupFacetCollector);
+        TermGroupFacetCollector.GroupedFacetResult actualFacetResult = groupFacetCollector.mergeSegmentResults(size, minCount, orderByCount);
+
+        List<TermGroupFacetCollector.FacetEntry> expectedFacetEntries = expectedFacetResult.getFacetEntries();
+        List<TermGroupFacetCollector.FacetEntry> actualFacetEntries = actualFacetResult.getFacetEntries(offset, limit);
+
+        if (VERBOSE) {
+          System.out.println("Collector: " + groupFacetCollector.getClass().getSimpleName());
+          System.out.println("Num group: " + context.numGroups);
+          System.out.println("Num doc: " + context.numDocs);
+          System.out.println("Index iter: " + indexIter);
+          System.out.println("multipleFacetsPerDocument: " + multipleFacetsPerDocument);
+          System.out.println("Search iter: " + searchIter);
+
+          System.out.println("Search term: " + searchTerm);
+          System.out.println("Min count: " + minCount);
+          System.out.println("Facet offset: " + offset);
+          System.out.println("Facet limit: " + limit);
+          System.out.println("Facet prefix: " + facetPrefix);
+          System.out.println("Order by count: " + orderByCount);
+
+          System.out.println("\n=== Expected: \n");
+          System.out.println("Total count " + expectedFacetResult.getTotalCount());
+          System.out.println("Total missing count " + expectedFacetResult.getTotalMissingCount());
+          int counter = 1;
+          for (TermGroupFacetCollector.FacetEntry expectedFacetEntry : expectedFacetEntries) {
+            System.out.println(
+                String.format(
+                    "%d. Expected facet value %s with count %d",
+                    counter++, expectedFacetEntry.getValue().utf8ToString(), expectedFacetEntry.getCount()
+                )
+            );
+          }
+
+          System.out.println("\n=== Actual: \n");
+          System.out.println("Total count " + actualFacetResult.getTotalCount());
+          System.out.println("Total missing count " + actualFacetResult.getTotalMissingCount());
+          counter = 1;
+          for (TermGroupFacetCollector.FacetEntry actualFacetEntry : actualFacetEntries) {
+            System.out.println(
+                String.format(
+                    "%d. Actual facet value %s with count %d",
+                    counter++, actualFacetEntry.getValue().utf8ToString(), actualFacetEntry.getCount()
+                )
+            );
+          }
+          System.out.println("\n===================================================================================");
+        }
+
+        assertEquals(expectedFacetResult.getTotalCount(), actualFacetResult.getTotalCount());
+        assertEquals(expectedFacetResult.getTotalMissingCount(), actualFacetResult.getTotalMissingCount());
+        assertEquals(expectedFacetEntries.size(), actualFacetEntries.size());
+        for (int i = 0; i < expectedFacetEntries.size(); i++) {
+          TermGroupFacetCollector.FacetEntry expectedFacetEntry = expectedFacetEntries.get(i);
+          TermGroupFacetCollector.FacetEntry actualFacetEntry = actualFacetEntries.get(i);
+          assertEquals(expectedFacetEntry.getValue().utf8ToString() + " != " + actualFacetEntry.getValue().utf8ToString(), expectedFacetEntry.getValue(), actualFacetEntry.getValue());
+          assertEquals(expectedFacetEntry.getCount() + " != " + actualFacetEntry.getCount(), expectedFacetEntry.getCount(), actualFacetEntry.getCount());
+        }
+      }
+
+      context.indexReader.close();
+      context.dir.close();
+    }
+  }
+
+  /**
+   * Builds a random index together with the bookkeeping needed to compute expected facet results.
+   * <p>
+   * Random group values, facet values and content strings are generated first. Each indexed
+   * document gets one content string, usually a group value and usually one facet value per
+   * facet field. While indexing, the method records per content string and per facet value
+   * (null standing for "no facet value") the set of group values seen (null standing for
+   * "no group value").
+   *
+   * @param multipleFacetValuesPerDocument whether a document gets several "facet" fields
+   *        (2 to 7) instead of a single one
+   * @return the reader, the directory and the recorded bookkeeping
+   * @throws IOException if indexing or opening the reader fails
+   */
+  private IndexContext createIndexContext(boolean multipleFacetValuesPerDocument) throws IOException {
+    final int numDocs = _TestUtil.nextInt(random, 138, 1145) * RANDOM_MULTIPLIER;
+    final int numGroups = _TestUtil.nextInt(random, 1, numDocs / 4);
+    final int numFacets = _TestUtil.nextInt(random, 1, numDocs / 6);
+
+    if (VERBOSE) {
+      System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
+    }
+
+    final List<String> groups = new ArrayList<String>();
+    for (int i = 0; i < numGroups; i++) {
+      groups.add(generateRandomNonEmptyString());
+    }
+    final List<String> facetValues = new ArrayList<String>();
+    for (int i = 0; i < numFacets; i++) {
+      facetValues.add(generateRandomNonEmptyString());
+    }
+    final String[] contentBrs = new String[_TestUtil.nextInt(random, 2, 20)];
+    if (VERBOSE) {
+      System.out.println("TEST: create fake content");
+    }
+    for (int contentIDX = 0; contentIDX < contentBrs.length; contentIDX++) {
+      contentBrs[contentIDX] = generateRandomNonEmptyString();
+      if (VERBOSE) {
+        System.out.println("  content=" + contentBrs[contentIDX]);
+      }
+    }
+
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(
+        random,
+        dir,
+        newIndexWriterConfig(
+            TEST_VERSION_CURRENT,
+            new MockAnalyzer(random)
+        )
+    );
+
+    // Four document templates that share the same Field instances. The field values are
+    // overwritten per indexed document; which template is added depends on whether the
+    // document has a group value and/or facet values.
+    Document doc = new Document();
+    Document docNoGroup = new Document();
+    Document docNoFacet = new Document();
+    Document docNoGroupNoFacet = new Document();
+    Field group = newField("group", "", StringField.TYPE_UNSTORED);
+    doc.add(group);
+    docNoFacet.add(group);
+    Field[] facetFields = multipleFacetValuesPerDocument? new Field[2 + random.nextInt(6)] : new Field[1];
+    for (int i = 0; i < facetFields.length; i++) {
+      facetFields[i] = newField("facet", "", StringField.TYPE_UNSTORED);
+      doc.add(facetFields[i]);
+      docNoGroup.add(facetFields[i]);
+    }
+    Field content = newField("content", "", StringField.TYPE_UNSTORED);
+    doc.add(content);
+    docNoGroup.add(content);
+    docNoFacet.add(content);
+    docNoGroupNoFacet.add(content);
+
+    // Sorted set of the facet values actually used; the comparator tolerates null and
+    // orders it before every non-null value.
+    NavigableSet<String> uniqueFacetValues = new TreeSet<String>(new Comparator<String>() {
+
+      public int compare(String a, String b) {
+        if (a == b) {
+          return 0;
+        } else if (a == null) {
+          return -1;
+        } else if (b == null) {
+          return 1;
+        } else {
+          return a.compareTo(b);
+        }
+      }
+
+    });
+    // content string -> facet value (null = no facet) -> group values seen (may contain null)
+    Map<String, Map<String, Set<String>>> searchTermToFacetToGroups = new HashMap<String, Map<String, Set<String>>>();
+    // Largest number of distinct groups recorded for a single (content, facet value) pair.
+    int facetWithMostGroups = 0;
+    for (int i = 0; i < numDocs; i++) {
+      final String groupValue;
+      if (random.nextInt(24) == 17) {
+        // So we test the "doc doesn't have the group'd
+        // field" case:
+        groupValue = null;
+      } else {
+        groupValue = groups.get(random.nextInt(groups.size()));
+      }
+
+      String contentStr = contentBrs[random.nextInt(contentBrs.length)];
+      if (!searchTermToFacetToGroups.containsKey(contentStr)) {
+        searchTermToFacetToGroups.put(contentStr, new HashMap<String, Set<String>>());
+      }
+      Map<String, Set<String>> facetToGroups = searchTermToFacetToGroups.get(contentStr);
+
+      List<String> facetVals = new ArrayList<String>();
+      // One draw in 24 produces a document without any facet value.
+      if (random.nextInt(24) != 18) {
+        for (Field facetField : facetFields) {
+          String facetValue = facetValues.get(random.nextInt(facetValues.size()));
+          uniqueFacetValues.add(facetValue);
+          if (!facetToGroups.containsKey(facetValue)) {
+            facetToGroups.put(facetValue, new HashSet<String>());
+          }
+          Set<String> groupsInFacet = facetToGroups.get(facetValue);
+          groupsInFacet.add(groupValue);
+          if (groupsInFacet.size() > facetWithMostGroups) {
+            facetWithMostGroups = groupsInFacet.size();
+          }
+          facetField.setStringValue(facetValue);
+          facetVals.add(facetValue);
+        }
+      } else {
+        // No facet value: record the group under the null key so the missing count can be derived.
+        uniqueFacetValues.add(null);
+        if (!facetToGroups.containsKey(null)) {
+          facetToGroups.put(null, new HashSet<String>());
+        }
+        Set<String> groupsInFacet = facetToGroups.get(null);
+        groupsInFacet.add(groupValue);
+        if (groupsInFacet.size() > facetWithMostGroups) {
+          facetWithMostGroups = groupsInFacet.size();
+        }
+      }
+
+      if (VERBOSE) {
+        System.out.println("  doc content=" + contentStr + " group=" + (groupValue == null ? "null" : groupValue) + " facetVals=" + facetVals);
+      }
+
+      if (groupValue != null) {
+        group.setStringValue(groupValue);
+      }
+      content.setStringValue(contentStr);
+      // Pick the template that contains exactly the fields this document has.
+      if (groupValue == null && facetVals.isEmpty()) {
+        writer.addDocument(docNoGroupNoFacet);
+      } else if (facetVals.isEmpty()) {
+        writer.addDocument(docNoFacet);
+      } else if (groupValue == null) {
+        writer.addDocument(docNoGroup);
+      } else {
+        writer.addDocument(doc);
+      }
+    }
+
+    DirectoryReader reader = writer.getReader();
+    writer.close();
+
+    return new IndexContext(searchTermToFacetToGroups, reader, numDocs, dir, facetWithMostGroups, numGroups, contentBrs, uniqueFacetValues);
+  }
+
+  /**
+   * Derives the facet result expected for one search from the bookkeeping in the index context.
+   * <p>
+   * Every non-null facet value (restricted to those starting with {@code facetPrefix} when a
+   * prefix is given) contributes its number of groups to the total count, and becomes an entry
+   * when that number is at least {@code minCount}. Entries are sorted by descending count when
+   * {@code orderByCount} is set, ties and the unordered case by value, and then cut down to the
+   * window {@code [offset, offset + limit)}.
+   *
+   * @return total count, missing count (0 whenever a prefix is given) and the entry window
+   */
+  private GroupedFacetResult createExpectedFacetResult(String searchTerm, IndexContext context, int offset, int limit, int minCount, final boolean orderByCount, String facetPrefix) {
+    Map<String, Set<String>> groupsPerFacet = context.searchTermToFacetGroups.get(searchTerm);
+    if (groupsPerFacet == null) {
+      groupsPerFacet = new HashMap<String, Set<String>>();
+    }
+
+    // Candidate facet values: everything that was indexed, or only the prefix matches.
+    final Set<String> candidates;
+    if (facetPrefix == null) {
+      candidates = context.facetValues;
+    } else {
+      candidates = new HashSet<String>();
+      for (String value : context.facetValues) {
+        if (value != null && value.startsWith(facetPrefix)) {
+          candidates.add(value);
+        }
+      }
+    }
+
+    int totalCount = 0;
+    List<TermGroupFacetCollector.FacetEntry> entries = new ArrayList<TermGroupFacetCollector.FacetEntry>(groupsPerFacet.size());
+    // also includes facets with count 0
+    for (String value : candidates) {
+      if (value == null) {
+        continue;
+      }
+
+      Set<String> groupsOfValue = groupsPerFacet.get(value);
+      int count = groupsOfValue == null ? 0 : groupsOfValue.size();
+      totalCount += count;
+      if (count >= minCount) {
+        entries.add(new TermGroupFacetCollector.FacetEntry(new BytesRef(value), count));
+      }
+    }
+
+    // Only include null count when no facet prefix is specified
+    int totalMissCount = 0;
+    if (facetPrefix == null) {
+      Set<String> groupsWithoutFacet = groupsPerFacet.get(null);
+      if (groupsWithoutFacet != null) {
+        totalMissCount = groupsWithoutFacet.size();
+      }
+    }
+
+    Collections.sort(entries, new Comparator<TermGroupFacetCollector.FacetEntry>() {
+
+      public int compare(TermGroupFacetCollector.FacetEntry left, TermGroupFacetCollector.FacetEntry right) {
+        // Higher counts first when ordering by count; fall back to the value otherwise.
+        int byCount = orderByCount ? right.getCount() - left.getCount() : 0;
+        if (byCount != 0) {
+          return byCount;
+        }
+        return left.getValue().compareTo(right.getValue());
+      }
+
+    });
+
+    // Cut out the requested window, clamping its end to the number of entries.
+    final List<TermGroupFacetCollector.FacetEntry> window;
+    if (offset >= entries.size()) {
+      window = Collections.emptyList();
+    } else {
+      window = entries.subList(offset, Math.min(offset + limit, entries.size()));
+    }
+    return new GroupedFacetResult(totalCount, totalMissCount, window);
+  }
+
+  /**
+   * Creates a grouped facet collector for the given fields. The facet prefix is converted to
+   * a {@code BytesRef} (null stays null) and the last factory argument is a random int in
+   * [0, 1024).
+   */
+  private TermGroupFacetCollector createRandomCollector(String groupField, String facetField, String facetPrefix, boolean multipleFacetsPerDocument) {
+    BytesRef prefixBytes = null;
+    if (facetPrefix != null) {
+      prefixBytes = new BytesRef(facetPrefix);
+    }
+    int randomSize = random.nextInt(1024);
+    return TermGroupFacetCollector.createTermGroupFacetCollector(groupField, facetField, multipleFacetsPerDocument, prefixBytes, randomSize);
+  }
+
+  /**
+   * Returns the element at position {@code index} in the set's iteration order.
+   *
+   * @return the element found there (which may itself be null), or null when the set has no
+   *         such position
+   */
+  private String getFromSet(Set<String> set, int index) {
+    Iterator<String> elements = set.iterator();
+    for (int position = 0; elements.hasNext(); position++) {
+      String element = elements.next();
+      if (position == index) {
+        return element;
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * Immutable holder for a randomly built index and the bookkeeping recorded while building it.
+   * Declared static because it never touches the enclosing test instance.
+   */
+  private static class IndexContext {
+
+    /** Number of documents that were indexed. */
+    final int numDocs;
+    /** Reader opened on the index; closed by the test that uses the context. */
+    final DirectoryReader indexReader;
+    /** Content string -> facet value (null = no facet value) -> group values seen. */
+    final Map<String, Map<String, Set<String>>> searchTermToFacetGroups;
+    /** Sorted facet values that were used while indexing; may contain null. */
+    final NavigableSet<String> facetValues;
+    /** Directory holding the index; closed by the test that uses the context. */
+    final Directory dir;
+    /** Largest number of groups recorded for a single facet value. */
+    final int facetWithMostGroups;
+    /** Number of group values that were generated. */
+    final int numGroups;
+    /** Content strings that were generated; searches pick their term from these. */
+    final String[] contentStrings;
+
+    public IndexContext(Map<String, Map<String, Set<String>>> searchTermToFacetGroups, DirectoryReader r,
+                        int numDocs, Directory dir, int facetWithMostGroups, int numGroups, String[] contentStrings, NavigableSet<String> facetValues) {
+      this.searchTermToFacetGroups = searchTermToFacetGroups;
+      this.indexReader = r;
+      this.numDocs = numDocs;
+      this.dir = dir;
+      this.facetWithMostGroups = facetWithMostGroups;
+      this.numGroups = numGroups;
+      this.contentStrings = contentStrings;
+      this.facetValues = facetValues;
+    }
+  }
+
+  /**
+   * Expected outcome of one grouped facet search: total count, missing count and the facet
+   * entries of the requested window. Declared static because it never touches the enclosing
+   * test instance.
+   */
+  private static class GroupedFacetResult {
+
+    final int totalCount;
+    final int totalMissingCount;
+    final List<TermGroupFacetCollector.FacetEntry> facetEntries;
+
+    private GroupedFacetResult(int totalCount, int totalMissingCount, List<TermGroupFacetCollector.FacetEntry> facetEntries) {
+      this.totalCount = totalCount;
+      this.totalMissingCount = totalMissingCount;
+      this.facetEntries = facetEntries;
+    }
+
+    /** Returns the total count this result was created with. */
+    public int getTotalCount() {
+      return totalCount;
+    }
+
+    /** Returns the missing count this result was created with. */
+    public int getTotalMissingCount() {
+      return totalMissingCount;
+    }
+
+    /** Returns the facet entries this result was created with. */
+    public List<TermGroupFacetCollector.FacetEntry> getFacetEntries() {
+      return facetEntries;
+    }
+  }
+
+}