You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2021/05/03 15:18:48 UTC
[lucene] branch main updated: LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122)

This is an automated email from the ASF dual-hosted git repository.

rmuir pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 650cad1  LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122)
650cad1 is described below

commit 650cad19a2ccf780a8ce30a6a8dfcf0d9d85022e
Author: Greg Miller <gs...@gmail.com>
AuthorDate: Mon May 3 08:18:38 2021 -0700

    LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122)
    
    The public API in LongValueFacetCounts previously required the user to specify whether the field being counted is single- or multi-valued (i.e., whether it is indexed as NumericDocValues or SortedNumericDocValues). Since this can be detected automatically, there is no need to ask users to specify it.
    
    Co-authored-by: Greg Miller <gm...@amazon.com>
---
 lucene/CHANGES.txt                                 |   5 +-
 lucene/MIGRATE.md                                  |   6 +
 .../apache/lucene/facet/LongValueFacetCounts.java  | 168 +++++++--------------
 .../lucene/facet/TestLongValueFacetCounts.java     |  40 +++--
 4 files changed, 86 insertions(+), 133 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 61fbee6..45dde72 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,7 +110,10 @@ API Changes
   Adrien Grand, Simon Willnauer)
 
 * LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand)  
-  
+
+* LUCENE-9948: No longer require the user to specify whether or not a field is multi-valued in
+  LongValueFacetCounts (this is now detected automatically from what is indexed). (Greg Miller)
+
 Improvements
 
 * LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
index 783efff..29f5f7b 100644
--- a/lucene/MIGRATE.md
+++ b/lucene/MIGRATE.md
@@ -400,3 +400,9 @@ field.
 
 Lucene index readers are now using so little memory with the default codec that
 it was decided to remove the ability to estimate their RAM usage.
+
+## LongValueFacetCounts no longer accepts multiValued param in constructors (LUCENE-9948)
+
+LongValueFacetCounts now automatically detects whether an indexed field is single- or
+multi-valued, so its constructors no longer take a multiValued argument. To migrate, simply
+drop this boolean from the constructor call.
\ No newline at end of file
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java
index 4379ab7..520d931 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java
@@ -54,52 +54,35 @@ public class LongValueFacetCounts extends Facets {
   /** Used for all values that are >= 1K. */
   private final LongIntScatterMap hashCounts = new LongIntScatterMap();
 
+  /** Field being counted. */
   private final String field;
 
   /**
-   * Total number of values counted, which is the subset of hits that had a value for this field.
+   * Total number of values counted. For single-valued fields this is the number of hits that had
+   * a value for this field; for multi-valued fields it is the sum of per-document value counts.
    */
   private int totCount;
 
   /**
    * Create {@code LongValueFacetCounts}, using either single-valued {@link NumericDocValues} or
-   * multi-valued {@link SortedNumericDocValues} from the specified field.
+   * multi-valued {@link SortedNumericDocValues} from the specified field (depending on what has
+   * been indexed).
    */
-  public LongValueFacetCounts(String field, FacetsCollector hits, boolean multiValued)
-      throws IOException {
-    this(field, null, hits, multiValued);
+  public LongValueFacetCounts(String field, FacetsCollector hits) throws IOException {
+    this(field, null, hits);
   }
 
   /**
-   * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. If hits is
-   * null then all facets are counted.
+   * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource} if non-null.
+   * If {@code valueSource} is null, doc values from the provided {@code field} will be used.
    */
   public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits)
       throws IOException {
-    this(field, valueSource, hits, false);
-  }
-
-  /**
-   * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. random access
-   * (implement {@link org.apache.lucene.search.DocIdSet#bits}).
-   */
-  public LongValueFacetCounts(
-      String field, LongValuesSource valueSource, FacetsCollector hits, boolean multiValued)
-      throws IOException {
     this.field = field;
-    if (valueSource == null) {
-      if (multiValued) {
-        countMultiValued(field, hits.getMatchingDocs());
-      } else {
-        count(field, hits.getMatchingDocs());
-      }
-    } else {
-      // value source is always single valued
-      if (multiValued) {
-        throw new IllegalArgumentException(
-            "can only compute multi-valued facets directly from doc values (when valueSource is null)");
-      }
+    if (valueSource != null) {
       count(valueSource, hits.getMatchingDocs());
+    } else {
+      count(field, hits.getMatchingDocs());
     }
   }
 
@@ -107,31 +90,32 @@ public class LongValueFacetCounts extends Facets {
    * Counts all facet values for this reader. This produces the same result as computing facets on a
    * {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient.
    */
-  public LongValueFacetCounts(String field, IndexReader reader, boolean multiValued)
-      throws IOException {
-    this.field = field;
-    if (multiValued) {
-      countAllMultiValued(reader, field);
-    } else {
-      countAll(reader, field);
-    }
+  public LongValueFacetCounts(String field, IndexReader reader) throws IOException {
+    this(field, null, reader);
   }
 
   /**
-   * Counts all facet values for the provided {@link LongValuesSource}. This produces the same
-   * result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more
-   * efficient.
+   * Counts all facet values for the provided {@link LongValuesSource} if non-null. If {@code
+   * valueSource} is null, doc values from the provided {@code field} will be used. This produces
+   * the same result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery},
+   * but is more efficient.
    */
   public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader)
       throws IOException {
     this.field = field;
-    countAll(valueSource, field, reader);
+    if (valueSource != null) {
+      countAll(reader, valueSource);
+    } else {
+      countAll(reader, field);
+    }
   }
 
+  /** Counts from the provided valueSource. */
   private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs)
       throws IOException {
 
     for (MatchingDocs hits : matchingDocs) {
+
       LongValues fv = valueSource.getValues(hits.context, null);
 
       // NOTE: this is not as efficient as working directly with the doc values APIs in the sparse
@@ -152,80 +136,41 @@ public class LongValueFacetCounts extends Facets {
     }
   }
 
+  /** Counts from the field's indexed doc values. */
   private void count(String field, List<MatchingDocs> matchingDocs) throws IOException {
-    for (MatchingDocs hits : matchingDocs) {
-      NumericDocValues fv = hits.context.reader().getNumericDocValues(field);
-      if (fv == null) {
-        continue;
-      }
-      countOneSegment(fv, hits);
-    }
-  }
-
-  private void countOneSegment(NumericDocValues values, MatchingDocs hits) throws IOException {
-    DocIdSetIterator it =
-        ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values));
-
-    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-      increment(values.longValue());
-      totCount++;
-    }
-  }
-
-  /** Counts directly from SortedNumericDocValues. */
-  private void countMultiValued(String field, List<MatchingDocs> matchingDocs) throws IOException {
 
     for (MatchingDocs hits : matchingDocs) {
-      SortedNumericDocValues values = hits.context.reader().getSortedNumericDocValues(field);
-      if (values == null) {
-        // this field has no doc values for this segment
-        continue;
-      }
 
-      NumericDocValues singleValues = DocValues.unwrapSingleton(values);
+      SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field);
+      NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
 
       if (singleValues != null) {
-        countOneSegment(singleValues, hits);
+
+        DocIdSetIterator it =
+            ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), singleValues));
+
+        for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+          increment(singleValues.longValue());
+          totCount++;
+        }
       } else {
 
         DocIdSetIterator it =
-            ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values));
+            ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), multiValues));
 
         for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-          int limit = values.docValueCount();
+          int limit = multiValues.docValueCount();
           totCount += limit;
           for (int i = 0; i < limit; i++) {
-            increment(values.nextValue());
+            increment(multiValues.nextValue());
           }
         }
       }
     }
   }
 
-  /** Optimized version that directly counts all doc values. */
-  private void countAll(IndexReader reader, String field) throws IOException {
-
-    for (LeafReaderContext context : reader.leaves()) {
-
-      NumericDocValues values = context.reader().getNumericDocValues(field);
-      if (values == null) {
-        // this field has no doc values for this segment
-        continue;
-      }
-
-      countAllOneSegment(values);
-    }
-  }
-
-  private void countAllOneSegment(NumericDocValues values) throws IOException {
-    while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-      totCount++;
-      increment(values.longValue());
-    }
-  }
-
-  private void countAll(LongValuesSource valueSource, String field, IndexReader reader)
-      throws IOException {
+  /** Count everything in the provided valueSource. */
+  private void countAll(IndexReader reader, LongValuesSource valueSource) throws IOException {
 
     for (LeafReaderContext context : reader.leaves()) {
       LongValues fv = valueSource.getValues(context, null);
@@ -241,24 +186,27 @@ public class LongValueFacetCounts extends Facets {
     }
   }
 
-  private void countAllMultiValued(IndexReader reader, String field) throws IOException {
+  /** Count everything in the specified field. */
+  private void countAll(IndexReader reader, String field) throws IOException {
 
     for (LeafReaderContext context : reader.leaves()) {
 
-      SortedNumericDocValues values = context.reader().getSortedNumericDocValues(field);
-      if (values == null) {
-        // this field has no doc values for this segment
-        continue;
-      }
-      NumericDocValues singleValues = DocValues.unwrapSingleton(values);
+      SortedNumericDocValues multiValues = DocValues.getSortedNumeric(context.reader(), field);
+      NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
       if (singleValues != null) {
-        countAllOneSegment(singleValues);
+
+        while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+          totCount++;
+          increment(singleValues.longValue());
+        }
       } else {
-        while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-          int limit = values.docValueCount();
+
+        while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+          int limit = multiValues.docValueCount();
           totCount += limit;
           for (int i = 0; i < limit; i++) {
-            increment(values.nextValue());
+            increment(multiValues.nextValue());
           }
         }
       }
@@ -294,7 +242,7 @@ public class LongValueFacetCounts extends Facets {
   /** Returns the specified top number of facets, sorted by count. */
   public FacetResult getTopChildrenSortByCount(int topN) {
     PriorityQueue<Entry> pq =
-        new PriorityQueue<Entry>(Math.min(topN, counts.length + hashCounts.size())) {
+        new PriorityQueue<>(Math.min(topN, counts.length + hashCounts.size())) {
           @Override
           protected boolean lessThan(Entry a, Entry b) {
             // sort by count descending, breaking ties by value ascending:
@@ -409,13 +357,13 @@ public class LongValueFacetCounts extends Facets {
   }
 
   @Override
-  public Number getSpecificValue(String dim, String... path) throws IOException {
+  public Number getSpecificValue(String dim, String... path) {
     // TODO: should we impl this?
     throw new UnsupportedOperationException();
   }
 
   @Override
-  public List<FacetResult> getAllDims(int topN) throws IOException {
+  public List<FacetResult> getAllDims(int topN) {
     return Collections.singletonList(getTopChildren(topN, field));
   }
 
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java
index 1c4b5eb..5821052 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java
@@ -19,7 +19,7 @@ package org.apache.lucene.facet;
 
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -60,7 +60,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
     IndexSearcher s = newSearcher(r);
     s.search(new MatchAllDocsQuery(), fc);
 
-    LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);
+    LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
 
     FacetResult result = facets.getAllChildrenSortByValue();
     assertEquals(
@@ -87,7 +87,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
     IndexSearcher s = newSearcher(r);
     s.search(new MatchAllDocsQuery(), fc);
 
-    LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);
+    LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
 
     FacetResult result = facets.getAllChildrenSortByValue();
     assertEquals(
@@ -119,7 +119,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
     IndexSearcher s = newSearcher(r);
     s.search(new MatchAllDocsQuery(), fc);
 
-    Facets facets = new LongValueFacetCounts("field", fc, false);
+    Facets facets = new LongValueFacetCounts("field", fc);
 
     List<FacetResult> result = facets.getAllDims(10);
     assertEquals(1, result.size());
@@ -199,7 +199,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
       List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
 
       // sort by value
-      Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
+      expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
 
       LongValueFacetCounts facetCounts;
       if (random().nextBoolean()) {
@@ -214,7 +214,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           if (VERBOSE) {
             System.out.println("  use doc values");
           }
-          facetCounts = new LongValueFacetCounts("field", fc, false);
+          facetCounts = new LongValueFacetCounts("field", fc);
         }
       } else {
         // optimized count all:
@@ -228,7 +228,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           if (VERBOSE) {
             System.out.println("  count all doc values");
           }
-          facetCounts = new LongValueFacetCounts("field", r, false);
+          facetCounts = new LongValueFacetCounts("field", r);
         }
       }
 
@@ -242,8 +242,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           Integer.MAX_VALUE);
 
       // sort by count
-      Collections.sort(
-          expectedCounts,
+      expectedCounts.sort(
           (a, b) -> {
             int cmp = -Integer.compare(a.getValue(), b.getValue());
             if (cmp == 0) {
@@ -288,7 +287,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
         if (VERBOSE) {
           System.out.println("  use doc values");
         }
-        facetCounts = new LongValueFacetCounts("field", fc, false);
+        facetCounts = new LongValueFacetCounts("field", fc);
       } else {
         if (VERBOSE) {
           System.out.println("  use value source");
@@ -314,7 +313,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
       expectedCounts = new ArrayList<>(expected.entrySet());
 
       // sort by value
-      Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
+      expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
       actual = facetCounts.getAllChildrenSortByValue();
       assertSame(
           "id " + minId + "-" + maxId + ", sort facets by value",
@@ -325,8 +324,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           Integer.MAX_VALUE);
 
       // sort by count
-      Collections.sort(
-          expectedCounts,
+      expectedCounts.sort(
           (a, b) -> {
             int cmp = -Integer.compare(a.getValue(), b.getValue());
             if (cmp == 0) {
@@ -447,7 +445,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
       List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
 
       // sort by value
-      Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
+      expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
 
       LongValueFacetCounts facetCounts;
       if (random().nextBoolean()) {
@@ -455,13 +453,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
         if (VERBOSE) {
           System.out.println("  use doc values");
         }
-        facetCounts = new LongValueFacetCounts("field", fc, true);
+        facetCounts = new LongValueFacetCounts("field", fc);
       } else {
         // optimized count all:
         if (VERBOSE) {
           System.out.println("  count all doc values");
         }
-        facetCounts = new LongValueFacetCounts("field", r, true);
+        facetCounts = new LongValueFacetCounts("field", r);
       }
 
       FacetResult actual = facetCounts.getAllChildrenSortByValue();
@@ -474,8 +472,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           Integer.MAX_VALUE);
 
       // sort by count
-      Collections.sort(
-          expectedCounts,
+      expectedCounts.sort(
           (a, b) -> {
             int cmp = -Integer.compare(a.getValue(), b.getValue());
             if (cmp == 0) {
@@ -517,7 +514,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
       fc = new FacetsCollector();
       s.search(IntPoint.newRangeQuery("id", minId, maxId), fc);
       // cannot use value source here because we are multi valued
-      facetCounts = new LongValueFacetCounts("field", fc, true);
+      facetCounts = new LongValueFacetCounts("field", fc);
 
       expected = new HashMap<>();
       expectedChildCount = 0;
@@ -538,7 +535,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
       expectedCounts = new ArrayList<>(expected.entrySet());
 
       // sort by value
-      Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
+      expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
       actual = facetCounts.getAllChildrenSortByValue();
       assertSame(
           "id " + minId + "-" + maxId + ", sort facets by value",
@@ -549,8 +546,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
           Integer.MAX_VALUE);
 
       // sort by count
-      Collections.sort(
-          expectedCounts,
+      expectedCounts.sort(
           (a, b) -> {
             int cmp = -Integer.compare(a.getValue(), b.getValue());
             if (cmp == 0) {