You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/09/04 18:11:36 UTC

svn commit: r1520060 - in /lucene/dev/trunk/lucene/core/src: java/org/apache/lucene/codecs/compressing/ test/org/apache/lucene/codecs/compressing/

Author: jpountz
Date: Wed Sep  4 16:11:36 2013
New Revision: 1520060

URL: http://svn.apache.org/r1520060
Log:
Fix compression bug on highly compressible inputs with LZ4.compressHC.

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestLZ4CompressionMode.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java?rev=1520060&r1=1520059&r2=1520060&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/LZ4.java Wed Sep  4 16:11:36 2013
@@ -219,7 +219,7 @@ final class LZ4 {
       final PackedInts.Mutable hashTable = ht.hashTable;
 
       main:
-      while (off < limit) {
+      while (off <= limit) {
         // find a match
         int ref;
         while (true) {
@@ -299,7 +299,7 @@ final class LZ4 {
     }
 
     private int next(int off) {
-      return base + off - (chainTable[off & MASK] & 0xFFFF);
+      return off - (chainTable[off & MASK] & 0xFFFF);
     }
 
     private void addHash(byte[] bytes, int off) {
@@ -310,7 +310,7 @@ final class LZ4 {
         delta = MAX_DISTANCE - 1;
       }
       chainTable[off & MASK] = (short) delta;
-      hashTable[h] = off - base;
+      hashTable[h] = off;
     }
 
     void insert(int off, byte[] bytes) {
@@ -322,12 +322,24 @@ final class LZ4 {
     boolean insertAndFindBestMatch(byte[] buf, int off, int matchLimit, Match match) {
       match.start = off;
       match.len = 0;
+      int delta = 0;
+      int repl = 0;
 
       insert(off, buf);
 
       int ref = hashPointer(buf, off);
+
+      if (ref >= off - 4 && ref <= off && ref >= base) { // potential repetition
+        if (readIntEquals(buf, ref, off)) { // confirmed
+          delta = off - ref;
+          repl = match.len = MIN_MATCH + commonBytes(buf, ref + MIN_MATCH, off + MIN_MATCH, matchLimit);
+          match.ref = ref;
+        }
+        ref = next(ref);
+      }
+
       for (int i = 0; i < MAX_ATTEMPTS; ++i) {
-        if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+        if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
           break;
         }
         if (buf[ref + match.len] == buf[off + match.len] && readIntEquals(buf, ref, off)) {
@@ -340,6 +352,21 @@ final class LZ4 {
         ref = next(ref);
       }
 
+      if (repl != 0) {
+        int ptr = off;
+        final int end = off + repl - (MIN_MATCH - 1);
+        while (ptr < end - delta) {
+          chainTable[ptr & MASK] = (short) delta; // pre load
+          ++ptr;
+        }
+        do {
+          chainTable[ptr & MASK] = (short) delta;
+          hashTable[hashHC(readInt(buf, ptr))] = ptr;
+          ++ptr;
+        } while (ptr < end);
+        nextToUpdate = end;
+      }
+
       return match.len != 0;
     }
 
@@ -351,7 +378,7 @@ final class LZ4 {
       final int delta = off - startLimit;
       int ref = hashPointer(buf, off);
       for (int i = 0; i < MAX_ATTEMPTS; ++i) {
-        if (ref < Math.max(base, off - MAX_DISTANCE + 1)) {
+        if (ref < Math.max(base, off - MAX_DISTANCE + 1) || ref > off) {
           break;
         }
         if (buf[ref - delta + match.len] == buf[startLimit + match.len]
@@ -386,6 +413,7 @@ final class LZ4 {
 
     final int srcEnd = srcOff + srcLen;
     final int matchLimit = srcEnd - LAST_LITERALS;
+    final int mfLimit = matchLimit - MIN_MATCH;
 
     int sOff = srcOff;
     int anchor = sOff++;
@@ -397,7 +425,7 @@ final class LZ4 {
     final Match match3 = new Match();
 
     main:
-    while (sOff < matchLimit) {
+    while (sOff <= mfLimit) {
       if (!ht.insertAndFindBestMatch(src, sOff, matchLimit, match1)) {
         ++sOff;
         continue;
@@ -409,7 +437,7 @@ final class LZ4 {
       search2:
       while (true) {
         assert match1.start >= anchor;
-        if (match1.end() >= matchLimit
+        if (match1.end() >= mfLimit
             || !ht.insertAndFindWiderMatch(src, match1.end() - 2, match1.start + 1, matchLimit, match1.len, match2)) {
           // no better match
           encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);
@@ -445,24 +473,11 @@ final class LZ4 {
             }
           }
 
-          if (match2.start + match2.len >= matchLimit
+          if (match2.start + match2.len >= mfLimit
               || !ht.insertAndFindWiderMatch(src, match2.end() - 3, match2.start, matchLimit, match2.len, match3)) {
             // no better match -> 2 sequences to encode
             if (match2.start < match1.end()) {
-              if (match2.start - match1.start < OPTIMAL_ML) {
-                if (match1.len > OPTIMAL_ML) {
-                  match1.len = OPTIMAL_ML;
-                }
-                if (match1.end() > match2.end() - MIN_MATCH) {
-                  match1.len = match2.end() - match1.start - MIN_MATCH;
-                }
-                final int correction = match1.len - (match2.start - match1.start);
-                if (correction > 0) {
-                  match2.fix(correction);
-                }
-              } else {
-                match1.len = match2.start - match1.start;
-              }
+              match1.len = match2.start - match1.start;
             }
             // encode seq 1
             encodeSequence(src, anchor, match1.ref, match1.start, match1.len, out);

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java?rev=1520060&r1=1520059&r2=1520060&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java Wed Sep  4 16:11:36 2013
@@ -24,6 +24,7 @@ import org.apache.lucene.store.ByteArray
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 import com.carrotsearch.randomizedtesting.generators.RandomInts;
 
@@ -130,4 +131,10 @@ public abstract class AbstractTestCompre
     test(decompressed);
   }
 
+  public void testConstant() throws IOException {
+    final byte[] decompressed = new byte[_TestUtil.nextInt(random(), 1, 10000)];
+    Arrays.fill(decompressed, (byte) random().nextInt());
+    test(decompressed);
+  }
+
 }
\ No newline at end of file

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestLZ4CompressionMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestLZ4CompressionMode.java?rev=1520060&r1=1520059&r2=1520060&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestLZ4CompressionMode.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestLZ4CompressionMode.java Wed Sep  4 16:11:36 2013
@@ -104,4 +104,8 @@ public abstract class AbstractTestLZ4Com
     test(decompressed);
   }
 
+  public void testMatchRightBeforeLastLiterals() throws IOException {
+    test(new byte[] {1,2,3,4, 1,2,3,4, 1,2,3,4,5});
+  }
+
 }