You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by lu...@apache.org on 2022/06/28 05:41:32 UTC

[lucene] branch branch_9x updated: LUCENE-10623: Error implementation of docValueCount for SortingSortedSetDocValues (#967)

This is an automated email from the ASF dual-hosted git repository.

luxugang pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new fb261e6ff48 LUCENE-10623: Error implementation of docValueCount for SortingSortedSetDocValues (#967)
fb261e6ff48 is described below

commit fb261e6ff48e5a57d9dff7fd960e21ec2634294d
Author: Lu Xugang <lu...@apache.org>
AuthorDate: Tue Jun 28 11:49:47 2022 +0800

    LUCENE-10623: Error implementation of docValueCount for SortingSortedSetDocValues (#967)
---
 lucene/CHANGES.txt                                 |  2 ++
 .../codecs/lucene90/Lucene90DocValuesConsumer.java | 10 +++---
 .../lucene/index/SortedSetDocValuesWriter.java     | 39 ++++++++++++++-------
 .../apache/lucene/index/SortingCodecReader.java    |  6 +++-
 .../lucene/index/TestSortingCodecReader.java       | 40 ++++++++++++++++++++++
 .../codecs/asserting/AssertingDocValuesFormat.java |  5 +--
 6 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 6397b590a22..1fa62180b9d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -129,6 +129,8 @@ Bug Fixes
 
 * GITHUB#986: Fix FieldExistsQuery rewrite when all docs have vectors. (Julie Tibshirani)
 
+* LUCENE-10623: Error implementation of docValueCount for SortingSortedSetDocValues (Lu Xugang)
+
 Other
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
index 819c34e097e..494d0edc3bc 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
@@ -804,12 +804,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
               public int nextDoc() throws IOException {
                 int doc = values.nextDoc();
                 if (doc != NO_MORE_DOCS) {
-                  docValueCount = 0;
-                  for (long ord = values.nextOrd();
-                      ord != SortedSetDocValues.NO_MORE_ORDS;
-                      ord = values.nextOrd()) {
-                    ords = ArrayUtil.grow(ords, docValueCount + 1);
-                    ords[docValueCount++] = ord;
+                  docValueCount = values.docValueCount();
+                  ords = ArrayUtil.grow(ords, docValueCount);
+                  for (int j = 0; j < docValueCount; j++) {
+                    ords[j] = values.nextOrd();
                   }
                   i = 0;
                 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 6ebe69c62b3..fedc0bf6b3b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -32,6 +32,7 @@ import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.GrowableWriter;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -228,7 +229,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
               state.segmentInfo.maxDoc(),
               sortMap,
               getValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField),
-              PackedInts.FASTEST);
+              PackedInts.FASTEST,
+              PackedInts.bitsRequired(maxCount));
     } else {
       docOrds = null;
     }
@@ -350,6 +352,8 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
     private final DocOrds ords;
     private int docID = -1;
     private long ordUpto;
+    private long limit;
+    private int count;
 
     SortingSortedSetDocValues(SortedSetDocValues in, DocOrds ords) {
       this.in = in;
@@ -369,7 +373,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
           return docID = NO_MORE_DOCS;
         }
       } while (ords.offsets[docID] <= 0);
-      ordUpto = ords.offsets[docID] - 1;
+      initCount();
       return docID;
     }
 
@@ -382,23 +386,23 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
     public boolean advanceExact(int target) throws IOException {
       // needed in IndexSorter#StringSorter
       docID = target;
-      ordUpto = ords.offsets[docID] - 1;
+      initCount();
       return ords.offsets[docID] > 0;
     }
 
     @Override
     public long nextOrd() {
-      long ord = ords.ords.get(ordUpto++);
-      if (ord == 0) {
+      if (limit == ordUpto) {
         return NO_MORE_ORDS;
       } else {
-        return ord - 1;
+        return ords.ords.get(ordUpto++);
       }
     }
 
     @Override
     public int docValueCount() {
-      return (int) ords.ords.size();
+      assert docID >= 0;
+      return count;
     }
 
     @Override
@@ -415,34 +419,45 @@ class SortedSetDocValuesWriter extends DocValuesWriter<SortedSetDocValues> {
     public long getValueCount() {
       return in.getValueCount();
     }
+
+    private void initCount() {
+      assert docID >= 0;
+      ordUpto = ords.offsets[docID] - 1;
+      count = (int) ords.docValueCounts.get(docID);
+      limit = ordUpto + count;
+    }
   }
 
   static final class DocOrds {
     final long[] offsets;
     final PackedLongValues ords;
+    final GrowableWriter docValueCounts;
+
+    public static final int START_BITS_PER_VALUE = 2;
 
     DocOrds(
         int maxDoc,
         Sorter.DocMap sortMap,
         SortedSetDocValues oldValues,
-        float acceptableOverheadRatio)
+        float acceptableOverheadRatio,
+        int bitsPerValue)
         throws IOException {
       offsets = new long[maxDoc];
       PackedLongValues.Builder builder = PackedLongValues.packedBuilder(acceptableOverheadRatio);
-      long ordOffset = 1; // 0 marks docs with no values
+      docValueCounts = new GrowableWriter(bitsPerValue, maxDoc, acceptableOverheadRatio);
+      long ordOffset = 1;
       int docID;
       while ((docID = oldValues.nextDoc()) != NO_MORE_DOCS) {
         int newDocID = sortMap.oldToNew(docID);
         long startOffset = ordOffset;
         long ord;
         while ((ord = oldValues.nextOrd()) != NO_MORE_ORDS) {
-          builder.add(ord + 1);
+          builder.add(ord);
           ordOffset++;
         }
+        docValueCounts.set(newDocID, ordOffset - startOffset);
         if (startOffset != ordOffset) { // do we have any values?
           offsets[newDocID] = startOffset;
-          builder.add(0); // 0 ord marks next value
-          ordOffset++;
         }
       }
       ords = builder.build();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
index 0c0f6c84896..f877ce58059 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java
@@ -483,7 +483,11 @@ public final class SortingCodecReader extends FilterCodecReader {
                 field.name,
                 () ->
                     new SortedSetDocValuesWriter.DocOrds(
-                        maxDoc(), docMap, oldDocValues, PackedInts.FAST)));
+                        maxDoc(),
+                        docMap,
+                        oldDocValues,
+                        PackedInts.FAST,
+                        SortedSetDocValuesWriter.DocOrds.START_BITS_PER_VALUE)));
       }
 
       @Override
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
index 42a898d87b4..db9c7cf1f51 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSortingCodecReader.java
@@ -43,6 +43,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.SortedSetSelector;
 import org.apache.lucene.search.SortedSetSortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
@@ -55,6 +56,45 @@ import org.apache.lucene.util.IOUtils;
 
 public class TestSortingCodecReader extends LuceneTestCase {
 
+  public void testSortOnAddIndicesOrd() throws IOException {
+    Directory tmpDir = newDirectory();
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    IndexWriter w = new IndexWriter(tmpDir, iwc);
+
+    Document doc;
+    doc = new Document();
+    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
+    w.addDocument(doc);
+
+    doc.add(new SortedSetDocValuesField("foo", new BytesRef("a")));
+    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
+    doc.add(new SortedSetDocValuesField("foo", new BytesRef("b")));
+    w.addDocument(doc);
+
+    w.commit();
+
+    Sort indexSort = new Sort(new SortedSetSortField("foo", false, SortedSetSelector.Type.MIN));
+    try (DirectoryReader reader = DirectoryReader.open(tmpDir)) {
+      for (LeafReaderContext ctx : reader.leaves()) {
+        CodecReader wrap =
+            SortingCodecReader.wrap(SlowCodecReaderWrapper.wrap(ctx.reader()), indexSort);
+        assertTrue(wrap.toString(), wrap.toString().startsWith("SortingCodecReader("));
+        SortingCodecReader sortingCodecReader = (SortingCodecReader) wrap;
+        SortedSetDocValues sortedSetDocValues =
+            sortingCodecReader
+                .getDocValuesReader()
+                .getSortedSet(ctx.reader().getFieldInfos().fieldInfo("foo"));
+        sortedSetDocValues.nextDoc();
+        assertEquals(sortedSetDocValues.docValueCount(), 2);
+        sortedSetDocValues.nextDoc();
+        assertEquals(sortedSetDocValues.docValueCount(), 1);
+        assertEquals(sortedSetDocValues.nextDoc(), DocIdSetIterator.NO_MORE_DOCS);
+      }
+    }
+    IOUtils.close(w, dir, tmpDir);
+  }
+
   public void testSortOnAddIndicesInt() throws IOException {
     Directory tmpDir = newDirectory();
     Directory dir = newDirectory();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java
index a32f2ac57e8..f33521fc10a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java
@@ -189,11 +189,8 @@ public class AssertingDocValuesFormat extends DocValuesFormat {
         }
 
         long lastOrd = -1;
-        while (true) {
+        for (int i = 0; i < values.docValueCount(); i++) {
           long ord = values.nextOrd();
-          if (ord == SortedSetDocValues.NO_MORE_ORDS) {
-            break;
-          }
           assert ord >= 0 && ord < valueCount
               : "ord=" + ord + " is not in bounds 0 .." + (valueCount - 1);
           assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;