You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/09/08 13:00:02 UTC

[lucene] branch branch_9x updated: Prevent term vectors from exceeding the maximum dictionary size. (#11726)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/branch_9x by this push:
     new 6dcf03ccd51 Prevent term vectors from exceeding the maximum dictionary size. (#11726)
6dcf03ccd51 is described below

commit 6dcf03ccd514cdf03630302bd2025dee179c134b
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Thu Sep 8 13:44:21 2022 +0200

    Prevent term vectors from exceeding the maximum dictionary size. (#11726)
    
    When indexing term vectors for a very large document, the automatic computation
    of the dictionary size based on the overall size of the block might yield a
    size that exceeds the maximum window size that is supported by LZ4. This commit
    addresses the issue by automatically taking the minimum of the result of this
    computation and the maximum window size (64kB).
---
 lucene/CHANGES.txt                                               | 7 +++++++
 .../lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java | 2 +-
 lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java    | 9 +++++++--
 .../lucene/codecs/compressing/AbstractTestCompressionMode.java   | 8 ++++++++
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b93e2ecd7ea..6a3a27535a2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -5,6 +5,13 @@ http://s.apache.org/luceneversions
 
 ======================== Lucene 9.5.0 =======================
 
+Bug Fixes
+---------------------
+
+* GITHUB#11726: Indexing term vectors on large documents could fail because the
+  automatically computed preset dictionary could be larger than the maximum
+  window size supported by LZ4 (64kB). (Adrien Grand)
+
 Other
 ---------------------
 * LUCENE-10423: Remove usages of System.currentTimeMillis() from tests. (Marios Trivyzas)
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java
index f1d406e60a5..6f28ca3d0a6 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java
@@ -170,7 +170,7 @@ public final class LZ4WithPresetDictCompressionMode extends CompressionMode {
     @Override
     public void compress(ByteBuffersDataInput buffersInput, DataOutput out) throws IOException {
       final int len = (int) (buffersInput.size() - buffersInput.position());
-      final int dictLength = len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR);
+      final int dictLength = Math.min(LZ4.MAX_DISTANCE, len / (NUM_SUB_BLOCKS * DICT_SIZE_FACTOR));
       final int blockLength = (len - dictLength + NUM_SUB_BLOCKS - 1) / NUM_SUB_BLOCKS;
       buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
       out.writeVInt(dictLength);
diff --git a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java
index 0deb228356d..67bbdc96ab2 100644
--- a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java
+++ b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java
@@ -47,9 +47,14 @@ public final class LZ4 {
 
   private LZ4() {}
 
+  /**
+   * Window size: the maximum supported distance between two occurrences of a string such that LZ4
+   * can replace the second occurrence with a back-reference to the first one.
+   */
+  public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
+
   static final int MEMORY_USAGE = 14;
   static final int MIN_MATCH = 4; // minimum length of a match
-  static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference
   static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals
   static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC
   static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC;
@@ -512,7 +517,7 @@ public final class LZ4 {
   /**
    * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into {@code out} using at most 16kB
    * of memory. {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. {@code dictLen}
-   * must not be greater than 64kB, the maximum window size.
+   * must not be greater than {@link LZ4#MAX_DISTANCE 64kB}, the maximum window size.
    *
    * <p>{@code ht} shouldn't be shared across threads but can safely be reused.
    */
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
index 9febfa97928..366f74730fc 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
@@ -154,4 +154,12 @@ public abstract class AbstractTestCompressionMode extends LuceneTestCase {
     Arrays.fill(decompressed, (byte) random().nextInt());
     test(decompressed);
   }
+
+  public void testExtremelyLargeInput() throws IOException {
+    final byte[] decompressed = new byte[1 << 24]; // 16MB
+    for (int i = 0; i < decompressed.length; ++i) {
+      decompressed[i] = (byte) (i & 0x0F);
+    }
+    test(decompressed);
+  }
 }