You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2022/09/13 14:01:23 UTC

[lucene] branch branch_9x updated: GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now returns top children instead of all children (#11764)

This is an automated email from the ASF dual-hosted git repository.

gsmiller pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 3334697e0d6 GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now returns top children instead of all children (#11764)
3334697e0d6 is described below

commit 3334697e0d6308182f0ad99e11166da3d40c5740
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Tue Sep 13 06:50:52 2022 -0700

    GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now returns top children instead of all children (#11764)
---
 lucene/CHANGES.txt                                 |   5 +
 .../facet/facetset/MatchingFacetSetsCounts.java    |  59 ++++++++++-
 .../facetset/TestMatchingFacetSetsCounts.java      | 110 +++++++++++++++++++++
 3 files changed, 173 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6a3a27535a2..30641e46c78 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -5,6 +5,11 @@ http://s.apache.org/luceneversions
 
 ======================== Lucene 9.5.0 =======================
 
+API Changes
+---------------------
+* GITHUB#11742: MatchingFacetSetsCounts#getTopChildren now properly returns "top" children instead
+  of all children. (Greg Miller)
+
 Bug Fixes
 ---------------------
 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java
index e6c8e67e288..c8151ab471e 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/facetset/MatchingFacetSetsCounts.java
@@ -30,6 +30,7 @@ import org.apache.lucene.index.DocValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
 
 /**
  * Returns the counts for each given {@link FacetSet}
@@ -156,7 +157,45 @@ public class MatchingFacetSetsCounts extends FacetCountsWithFilterQuery {
   @Override
   public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
     validateTopN(topN);
-    return getAllChildren(dim, path);
+
+    topN = Math.min(topN, counts.length);
+
+    PriorityQueue<Entry> pq =
+        new PriorityQueue<>(topN, () -> new Entry("", 0)) {
+          @Override
+          protected boolean lessThan(Entry a, Entry b) {
+            return compare(a.count, b.count, a.label, b.label) < 0;
+          }
+        };
+
+    int childCount = 0;
+    Entry reuse = pq.top();
+    for (int i = 0; i < counts.length; i++) {
+      int count = counts[i];
+      if (count > 0) {
+        childCount++;
+        String label = facetSetMatchers[i].label;
+        if (compare(reuse.count, count, reuse.label, label) < 0) {
+          reuse.label = label;
+          reuse.count = count;
+          reuse = pq.updateTop();
+        }
+      }
+    }
+
+    // Pop off any sentinel values in the case that we had fewer child labels with non-zero
+    // counts than the requested top-n:
+    while (childCount < pq.size()) {
+      pq.pop();
+    }
+
+    LabelAndValue[] labelValues = new LabelAndValue[Math.min(topN, childCount)];
+    for (int i = pq.size() - 1; i >= 0; i--) {
+      Entry e = pq.pop();
+      labelValues[i] = new LabelAndValue(e.label, e.count);
+    }
+
+    return new FacetResult(dim, path, totCount, labelValues, childCount);
   }
 
   @Override
@@ -176,4 +215,22 @@ public class MatchingFacetSetsCounts extends FacetCountsWithFilterQuery {
     return Arrays.stream(facetSetMatchers)
         .anyMatch(facetSetMatcher -> facetSetMatcher.dims != dims);
   }
+
+  private static int compare(int count1, int count2, String label1, String label2) {
+    int cmp = Integer.compare(count1, count2);
+    if (cmp == 0) {
+      cmp = label2.compareTo(label1);
+    }
+    return cmp;
+  }
+
+  private static final class Entry {
+    String label;
+    int count;
+
+    Entry(String label, int count) {
+      this.label = label;
+      this.count = count;
+    }
+  }
 }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/facetset/TestMatchingFacetSetsCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/facetset/TestMatchingFacetSetsCounts.java
index 51466137f9e..e1bbad55e5c 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/facetset/TestMatchingFacetSetsCounts.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/facetset/TestMatchingFacetSetsCounts.java
@@ -16,19 +16,30 @@
  */
 package org.apache.lucene.facet.facetset;
 
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
 import java.io.IOException;
+import java.util.Locale;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.facet.FacetResult;
 import org.apache.lucene.facet.FacetTestCase;
 import org.apache.lucene.facet.Facets;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsCollectorManager;
+import org.apache.lucene.facet.LabelAndValue;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
 
 public class TestMatchingFacetSetsCounts extends FacetTestCase {
+  private static final int FORD_ORD = 100;
+  private static final int TOYOTA_ORD = 101;
+  private static final int CHEVY_ORD = 102;
+  private static final int NISSAN_ORD = 103;
+  private static final int[] MANUFACTURER_ORDS = {FORD_ORD, TOYOTA_ORD, CHEVY_ORD, NISSAN_ORD};
 
   public void testInvalidTopN() throws IOException {
     Directory d = newDirectory();
@@ -87,4 +98,103 @@ public class TestMatchingFacetSetsCounts extends FacetTestCase {
     r.close();
     d.close();
   }
+
+  public void testTopChildren() throws Exception {
+    Directory d = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), d);
+
+    // As a test scenario, we're faceting on the number of vehicles produced per day/make
+    // combination over the past 30 days:
+    final int numBins = 30;
+
+    final int[] expectedCounts = new int[numBins * MANUFACTURER_ORDS.length];
+    FacetSetMatcher[] facetSetMatchers = new FacetSetMatcher[numBins * MANUFACTURER_ORDS.length];
+
+    int totalDocs = 0;
+    int totalNonZeroBins = 0;
+    int index = 0;
+    for (int i = 0; i < numBins; i++) {
+      for (int ord : MANUFACTURER_ORDS) {
+        facetSetMatchers[index] =
+            new ExactFacetSetMatcher(
+                String.format(Locale.ROOT, "%d:%d", i, ord), new LongFacetSet(i, ord));
+
+        int carsManufactured = RandomNumbers.randomIntBetween(random(), 0, 100);
+        for (int k = 0; k < carsManufactured; k++) {
+          // Create a document for every vehicle produced:
+          Document doc = new Document();
+          doc.add(FacetSetsField.create("field", new LongFacetSet(i, ord)));
+          w.addDocument(doc);
+        }
+
+        if (carsManufactured > 0) {
+          totalNonZeroBins++;
+        }
+        totalDocs += carsManufactured;
+        expectedCounts[index] = carsManufactured;
+        index++;
+      }
+    }
+
+    IndexReader r = w.getReader();
+    w.close();
+
+    IndexSearcher s = newSearcher(r);
+    FacetsCollector fc = s.search(new MatchAllDocsQuery(), new FacetsCollectorManager());
+
+    Facets facets =
+        new MatchingFacetSetsCounts("field", fc, FacetSetDecoder::decodeLongs, facetSetMatchers);
+
+    // Sort by count (high-to-low) and tie-break on label, same as in
+    // MatchingFacetCounts#getTopChildren:
+    final int[] originalIndexes = new int[expectedCounts.length];
+    for (int i = 0; i < originalIndexes.length; i++) {
+      originalIndexes[i] = i;
+    }
+    new InPlaceMergeSorter() {
+      @Override
+      protected int compare(int i, int j) {
+        int cmp = Integer.compare(expectedCounts[j], expectedCounts[i]);
+        if (cmp == 0) {
+          int dayBinI = originalIndexes[i] / MANUFACTURER_ORDS.length;
+          int dayBinJ = originalIndexes[j] / MANUFACTURER_ORDS.length;
+          int ordIndexI = originalIndexes[i] % MANUFACTURER_ORDS.length;
+          int ordIndexJ = originalIndexes[j] % MANUFACTURER_ORDS.length;
+          String labelI =
+              String.format(Locale.ROOT, "%d:%d", dayBinI, MANUFACTURER_ORDS[ordIndexI]);
+          String labelJ =
+              String.format(Locale.ROOT, "%d:%d", dayBinJ, MANUFACTURER_ORDS[ordIndexJ]);
+          cmp = new BytesRef(labelI).compareTo(new BytesRef(labelJ));
+        }
+        return cmp;
+      }
+
+      @Override
+      protected void swap(int i, int j) {
+        int tmp = expectedCounts[i];
+        expectedCounts[i] = expectedCounts[j];
+        expectedCounts[j] = tmp;
+        tmp = originalIndexes[i];
+        originalIndexes[i] = originalIndexes[j];
+        originalIndexes[j] = tmp;
+      }
+    }.sort(0, expectedCounts.length);
+
+    final int topN = 10;
+    final LabelAndValue[] expected = new LabelAndValue[topN];
+    for (int i = 0; i < topN; i++) {
+      int count = expectedCounts[i];
+      int dayBin = originalIndexes[i] / MANUFACTURER_ORDS.length;
+      int ordIndex = originalIndexes[i] % MANUFACTURER_ORDS.length;
+      expected[i] =
+          new LabelAndValue(
+              String.format(Locale.ROOT, "%d:%d", dayBin, MANUFACTURER_ORDS[ordIndex]), count);
+    }
+
+    final FacetResult result = facets.getTopChildren(topN, "field");
+    assertFacetResult(result, "field", new String[0], totalNonZeroBins, totalDocs, expected);
+
+    r.close();
+    d.close();
+  }
 }