You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/08/16 12:40:05 UTC

[lucene] branch main updated: LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 19e5c00  LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression
19e5c00 is described below

commit 19e5c00a4fff111100768f50ece12015196e15e0
Author: Mike McCandless <mi...@apache.org>
AuthorDate: Mon Aug 16 08:40:02 2021 -0400

    LUCENE-10014: fix performance bug: when writing doc values with block GCD compression we were unnecessarily wasting index storage by failing to take fully advantage of the GCD compression
---
 lucene/CHANGES.txt                                 |  4 ++++
 .../lucene80/Lucene80DocValuesConsumer.java        |  2 +-
 .../BaseLucene80DocValuesFormatTestCase.java       | 23 ++++++++++++++------
 .../codecs/lucene90/Lucene90DocValuesConsumer.java |  2 +-
 .../lucene90/TestLucene90DocValuesFormat.java      | 25 ++++++++++++++++------
 5 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 23d5116..6460112 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -421,6 +421,10 @@ Optimizations
 * LUCENE-10031: Slightly faster segment merging for sorted indices.
   (Adrien Grand)
 
+* LUCENE-10014: Lucene90DocValuesFormat was using too many bits per
+  value when compressing via gcd, unnecessarily wasting index storage.
+  (weizijun)
+
 Bug Fixes
 ---------------------
 * LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesConsumer.java
index 1d3bc39..12d5b77 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesConsumer.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesConsumer.java
@@ -388,7 +388,7 @@ final class Lucene80DocValuesConsumer extends DocValuesConsumer {
       data.writeByte((byte) 0);
       data.writeLong(min);
     } else {
-      final int bitsPerValue = LegacyDirectWriter.unsignedBitsRequired(max - min);
+      final int bitsPerValue = LegacyDirectWriter.unsignedBitsRequired((max - min) / gcd);
       buffer.reset();
       assert buffer.size() == 0;
       final LegacyDirectWriter w = LegacyDirectWriter.getInstance(buffer, length, bitsPerValue);
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
index fce0a39..0473375 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java
@@ -634,11 +634,13 @@ public abstract class BaseLucene80DocValuesFormatTestCase
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
     conf.setRAMBufferSizeMB(-1);
+    // so Lucene docids are predictable / stay in order
     conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
     IndexWriter writer = new IndexWriter(dir, conf);
 
     final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE * 3);
     final LongSupplier values = blocksOfVariousBPV();
+    List<long[]> writeDocValues = new ArrayList<>();
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
 
@@ -650,6 +652,7 @@ public abstract class BaseLucene80DocValuesFormatTestCase
         doc.add(new SortedNumericDocValuesField("dv", value));
       }
       Arrays.sort(valueArray);
+      writeDocValues.add(valueArray);
       for (int j = 0; j < valueCount; j++) {
         doc.add(new StoredField("stored", Long.toString(valueArray[j])));
       }
@@ -672,15 +675,23 @@ public abstract class BaseLucene80DocValuesFormatTestCase
         if (i > docValues.docID()) {
           docValues.nextDoc();
         }
-        String expected[] = r.document(i).getValues("stored");
+        String expectedStored[] = r.document(i).getValues("stored");
         if (i < docValues.docID()) {
-          assertEquals(0, expected.length);
+          assertEquals(0, expectedStored.length);
         } else {
-          String actual[] = new String[docValues.docValueCount()];
-          for (int j = 0; j < actual.length; j++) {
-            actual[j] = Long.toString(docValues.nextValue());
+          long[] readValueArray = new long[docValues.docValueCount()];
+          String actualDocValue[] = new String[docValues.docValueCount()];
+          for (int j = 0; j < docValues.docValueCount(); ++j) {
+            long actualDV = docValues.nextValue();
+            readValueArray[j] = actualDV;
+            actualDocValue[j] = Long.toString(readValueArray[j]);
           }
-          assertArrayEquals(expected, actual);
+          long[] writeValueArray = writeDocValues.get(i);
+          // compare write values and read values
+          assertArrayEquals(readValueArray, writeValueArray);
+
+          // compare dv and stored values
+          assertArrayEquals(expectedStored, actualDocValue);
         }
       }
     }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
index 227e30c..e914698 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java
@@ -376,7 +376,7 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer {
       data.writeByte((byte) 0);
       data.writeLong(min);
     } else {
-      final int bitsPerValue = DirectWriter.unsignedBitsRequired(max - min);
+      final int bitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
       buffer.reset();
       assert buffer.size() == 0;
       final DirectWriter w = DirectWriter.getInstance(buffer, length, bitsPerValue);
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
index 79c744a..3301582 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
@@ -618,6 +618,7 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
   }
 
   private static LongSupplier blocksOfVariousBPV() {
+    // this helps exercise GCD compression:
     final long mul = TestUtil.nextInt(random(), 1, 100);
     final long min = random().nextInt();
     return new LongSupplier() {
@@ -627,6 +628,8 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
       @Override
       public long getAsLong() {
         if (i == Lucene90DocValuesFormat.NUMERIC_BLOCK_SIZE) {
+          // change the range of the random generated values on block boundaries, so we exercise
+          // different bits-per-value for each block, and encourage block compression
           maxDelta = 1 << random().nextInt(5);
           i = 0;
         }
@@ -647,6 +650,7 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
 
     final int numDocs = atLeast(Lucene90DocValuesFormat.NUMERIC_BLOCK_SIZE * 3);
     final LongSupplier values = blocksOfVariousBPV();
+    List<long[]> writeDocValues = new ArrayList<>();
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
 
@@ -658,6 +662,7 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
         doc.add(new SortedNumericDocValuesField("dv", value));
       }
       Arrays.sort(valueArray);
+      writeDocValues.add(valueArray);
       for (int j = 0; j < valueCount; j++) {
         doc.add(new StoredField("stored", Long.toString(valueArray[j])));
       }
@@ -680,15 +685,23 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
         if (i > docValues.docID()) {
           docValues.nextDoc();
         }
-        String expected[] = r.document(i).getValues("stored");
+        String expectedStored[] = r.document(i).getValues("stored");
         if (i < docValues.docID()) {
-          assertEquals(0, expected.length);
+          assertEquals(0, expectedStored.length);
         } else {
-          String actual[] = new String[docValues.docValueCount()];
-          for (int j = 0; j < actual.length; j++) {
-            actual[j] = Long.toString(docValues.nextValue());
+          long[] readValueArray = new long[docValues.docValueCount()];
+          String actualDocValue[] = new String[docValues.docValueCount()];
+          for (int j = 0; j < docValues.docValueCount(); ++j) {
+            long actualDV = docValues.nextValue();
+            readValueArray[j] = actualDV;
+            actualDocValue[j] = Long.toString(readValueArray[j]);
           }
-          assertArrayEquals(expected, actual);
+          long[] writeValueArray = writeDocValues.get(i);
+          // compare write values and read values
+          assertArrayEquals(readValueArray, writeValueArray);
+
+          // compare dv and stored values
+          assertArrayEquals(expectedStored, actualDocValue);
         }
       }
     }