You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/08/16 13:14:53 UTC
[lucene-solr] branch branch_8x updated: LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 372c445  LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression
372c445 is described below

commit 372c445c286f04480e7a62b48e599109a08c8d32
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Mon Aug 16 08:40:02 2021 -0400

    LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression
---
 lucene/CHANGES.txt                                 |  4 ++++
 .../codecs/lucene80/Lucene80DocValuesConsumer.java |  2 +-
 .../BaseLucene80DocValuesFormatTestCase.java       | 23 ++++++++++++++++------
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 800d907..059c148 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -78,6 +78,10 @@ Optimizations
 * LUCENE-10031: Slightly faster segment merging for sorted indices.
   (Adrien Grand)
 
+* LUCENE-10014: Lucene90DocValuesFormat was using too many bits per
+  value when compressing via gcd, unnecessarily wasting index storage.
+  (weizijun)
+
 Bug Fixes
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
index bbde937..5f09074 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
@@ -353,7 +353,7 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Close
       data.writeByte((byte) 0);
       data.writeLong(min);
     } else {
-      final int bitsPerValue = DirectWriter.unsignedBitsRequired(max - min);
+      final int bitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
       buffer.reset();
       assert buffer.size() == 0;
       final DirectWriter w = DirectWriter.getInstance(buffer, length, bitsPerValue);
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
index 8696777..51a4cbd 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
@@ -621,11 +621,13 @@ public abstract class BaseLucene80DocValuesFormatTestCase extends BaseCompressin
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
     conf.setRAMBufferSizeMB(-1);
+    // so Lucene docids are predictable / stay in order
     conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
     IndexWriter writer = new IndexWriter(dir, conf);
     
     final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
     final LongSupplier values = blocksOfVariousBPV();
+    List<long[]> writeDocValues = new ArrayList<>();
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
       
@@ -637,6 +639,7 @@ public abstract class BaseLucene80DocValuesFormatTestCase extends BaseCompressin
         doc.add(new SortedNumericDocValuesField("dv", value));
       }
       Arrays.sort(valueArray);
+      writeDocValues.add(valueArray);
       for (int j = 0; j < valueCount; j++) {
         doc.add(new StoredField("stored", Long.toString(valueArray[j])));
       }
@@ -659,15 +662,23 @@ public abstract class BaseLucene80DocValuesFormatTestCase extends BaseCompressin
         if (i > docValues.docID()) {
           docValues.nextDoc();
         }
-        String expected[] = r.document(i).getValues("stored");
+        String expectedStored[] = r.document(i).getValues("stored");
         if (i < docValues.docID()) {
-          assertEquals(0, expected.length);
+          assertEquals(0, expectedStored.length);
         } else {
-          String actual[] = new String[docValues.docValueCount()];
-          for (int j = 0; j < actual.length; j++) {
-            actual[j] = Long.toString(docValues.nextValue());
+          long[] readValueArray = new long[docValues.docValueCount()];
+          String actualDocValue[] = new String[docValues.docValueCount()];
+          for (int j = 0; j < docValues.docValueCount(); ++j) {
+            long actualDV = docValues.nextValue();
+            readValueArray[j] = actualDV;
+            actualDocValue[j] = Long.toString(readValueArray[j]);
           }
-          assertArrayEquals(expected, actual);
+          long[] writeValueArray = writeDocValues.get(i);
+          // compare write values and read values
+          assertArrayEquals(readValueArray, writeValueArray);
+
+          // compare dv and stored values
+          assertArrayEquals(expectedStored, actualDocValue);
         }
       }
     }