You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/12/02 19:20:04 UTC

svn commit: r1642981 - in /lucene/dev/trunk/lucene: core/src/java/org/apache/lucene/codecs/compressing/ core/src/java/org/apache/lucene/codecs/lucene50/ test-framework/src/java/org/apache/lucene/codecs/compressing/ test-framework/src/java/org/apache/lu...

Author: rmuir
Date: Tue Dec  2 18:20:03 2014
New Revision: 1642981

URL: http://svn.apache.org/r1642981
Log:
LUCENE-6089: Tune CompressionMode.HIGH_COMPRESSION

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsFormat.java Tue Dec  2 18:20:03 2014
@@ -48,15 +48,16 @@ public class CompressingStoredFieldsForm
   private final String segmentSuffix;
   private final CompressionMode compressionMode;
   private final int chunkSize;
+  private final int maxDocsPerChunk;
 
   /**
    * Create a new {@link CompressingStoredFieldsFormat} with an empty segment 
    * suffix.
    * 
-   * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int)
+   * @see CompressingStoredFieldsFormat#CompressingStoredFieldsFormat(String, String, CompressionMode, int, int)
    */
-  public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize) {
-    this(formatName, "", compressionMode, chunkSize);
+  public CompressingStoredFieldsFormat(String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
+    this(formatName, "", compressionMode, chunkSize, maxDocsPerChunk);
   }
   
   /**
@@ -79,6 +80,8 @@ public class CompressingStoredFieldsForm
    * <code>chunkSize</code> is the minimum byte size of a chunk of documents.
    * A value of <code>1</code> can make sense if there is redundancy across
    * fields.
+   * <code>maxDocsPerChunk</code> is an upperbound on how many docs may be stored
+   * in a single chunk. This is to bound the cpu costs for highly compressible data.
    * <p>
    * Higher values of <code>chunkSize</code> should improve the compression
    * ratio but will require more memory at indexing time and might make document
@@ -88,10 +91,11 @@ public class CompressingStoredFieldsForm
    * @param formatName the name of the {@link StoredFieldsFormat}
    * @param compressionMode the {@link CompressionMode} to use
    * @param chunkSize the minimum number of bytes of a single chunk of stored documents
+   * @param maxDocsPerChunk the maximum number of documents in a single chunk
    * @see CompressionMode
    */
   public CompressingStoredFieldsFormat(String formatName, String segmentSuffix, 
-                                       CompressionMode compressionMode, int chunkSize) {
+                                       CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
     this.formatName = formatName;
     this.segmentSuffix = segmentSuffix;
     this.compressionMode = compressionMode;
@@ -99,7 +103,10 @@ public class CompressingStoredFieldsForm
       throw new IllegalArgumentException("chunkSize must be >= 1");
     }
     this.chunkSize = chunkSize;
-    
+    if (maxDocsPerChunk < 1) {
+      throw new IllegalArgumentException("maxDocsPerChunk must be >= 1");
+    }
+    this.maxDocsPerChunk = maxDocsPerChunk;
   }
 
   @Override
@@ -113,13 +120,13 @@ public class CompressingStoredFieldsForm
   public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si,
       IOContext context) throws IOException {
     return new CompressingStoredFieldsWriter(directory, si, segmentSuffix, context,
-        formatName, compressionMode, chunkSize);
+        formatName, compressionMode, chunkSize, maxDocsPerChunk);
   }
 
   @Override
   public String toString() {
     return getClass().getSimpleName() + "(compressionMode=" + compressionMode
-        + ", chunkSize=" + chunkSize + ")";
+        + ", chunkSize=" + chunkSize + ", maxDocsPerChunk=" + maxDocsPerChunk + ")";
   }
 
 }

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Tue Dec  2 18:20:03 2014
@@ -54,9 +54,6 @@ public final class CompressingStoredFiel
   
   /** Extension of stored fields index file */
   public static final String FIELDS_INDEX_EXTENSION = "fdx";
-  
-  // hard limit on the maximum number of documents per chunk
-  static final int MAX_DOCUMENTS_PER_CHUNK = 128;
 
   static final int         STRING = 0x00;
   static final int       BYTE_ARR = 0x01;
@@ -82,6 +79,7 @@ public final class CompressingStoredFiel
   private final CompressionMode compressionMode;
   private final Compressor compressor;
   private final int chunkSize;
+  private final int maxDocsPerChunk;
 
   private final GrowableByteArrayDataOutput bufferedDocs;
   private int[] numStoredFields; // number of stored fields
@@ -91,7 +89,7 @@ public final class CompressingStoredFiel
 
   /** Sole constructor. */
   public CompressingStoredFieldsWriter(Directory directory, SegmentInfo si, String segmentSuffix, IOContext context,
-      String formatName, CompressionMode compressionMode, int chunkSize) throws IOException {
+      String formatName, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) throws IOException {
     assert directory != null;
     this.directory = directory;
     this.segment = si.name;
@@ -99,6 +97,7 @@ public final class CompressingStoredFiel
     this.compressionMode = compressionMode;
     this.compressor = compressionMode.newCompressor();
     this.chunkSize = chunkSize;
+    this.maxDocsPerChunk = maxDocsPerChunk;
     this.docBase = 0;
     this.bufferedDocs = new GrowableByteArrayDataOutput(chunkSize);
     this.numStoredFields = new int[16];
@@ -210,7 +209,7 @@ public final class CompressingStoredFiel
 
   private boolean triggerFlush() {
     return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
-        numBufferedDocs >= MAX_DOCUMENTS_PER_CHUNK;
+        numBufferedDocs >= maxDocsPerChunk;
   }
 
   private void flush() throws IOException {

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressionMode.java Tue Dec  2 18:20:03 2014
@@ -70,7 +70,8 @@ public abstract class CompressionMode {
 
     @Override
     public Compressor newCompressor() {
-      return new DeflateCompressor(Deflater.BEST_COMPRESSION);
+      // 3 is the highest level that doesn't have lazy match evaluation
+      return new DeflateCompressor(3);
     }
 
     @Override

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java Tue Dec  2 18:20:03 2014
@@ -118,7 +118,7 @@ public final class Lucene50StoredFieldsF
 
   /** Sole constructor. */
   public Lucene50StoredFieldsFormat() {
-    super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14);
+    super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
   }
 
 }

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java Tue Dec  2 18:20:03 2014
@@ -36,16 +36,16 @@ public abstract class CompressingCodec e
   /**
    * Create a random instance.
    */
-  public static CompressingCodec randomInstance(Random random, int chunkSize, boolean withSegmentSuffix) {
+  public static CompressingCodec randomInstance(Random random, int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
     switch (random.nextInt(4)) {
     case 0:
-      return new FastCompressingCodec(chunkSize, withSegmentSuffix);
+      return new FastCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
     case 1:
-      return new FastDecompressionCompressingCodec(chunkSize, withSegmentSuffix);
+      return new FastDecompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
     case 2:
-      return new HighCompressionCompressingCodec(chunkSize, withSegmentSuffix);
+      return new HighCompressionCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
     case 3:
-      return new DummyCompressingCodec(chunkSize, withSegmentSuffix);
+      return new DummyCompressingCodec(chunkSize, maxDocsPerChunk, withSegmentSuffix);
     default:
       throw new AssertionError();
     }
@@ -56,14 +56,14 @@ public abstract class CompressingCodec e
    * suffix
    */
   public static CompressingCodec randomInstance(Random random) {
-    return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), false);
+    return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), false);
   }
   
   /**
    * Creates a random {@link CompressingCodec} that is using a segment suffix
    */
   public static CompressingCodec randomInstance(Random random, boolean withSegmentSuffix) {
-    return randomInstance(random, RandomInts.randomIntBetween(random, 1, 500), withSegmentSuffix);
+    return randomInstance(random, RandomInts.randomIntBetween(random, 1, 1 << 15), RandomInts.randomIntBetween(random, 64, 1024), withSegmentSuffix);
   }
 
   private final CompressingStoredFieldsFormat storedFieldsFormat;
@@ -72,17 +72,17 @@ public abstract class CompressingCodec e
   /**
    * Creates a compressing codec with a given segment suffix
    */
-  public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) {
+  public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
     super(name, TestUtil.getDefaultCodec());
-    this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize);
+    this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize, maxDocsPerChunk);
     this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize);
   }
   
   /**
    * Creates a compressing codec with an empty segment suffix
    */
-  public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize) {
-    this(name, "", compressionMode, chunkSize);
+  public CompressingCodec(String name, CompressionMode compressionMode, int chunkSize, int maxDocsPerChunk) {
+    this(name, "", compressionMode, chunkSize, maxDocsPerChunk);
   }
 
   @Override

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java Tue Dec  2 18:20:03 2014
@@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compres
 public class FastCompressingCodec extends CompressingCodec {
 
   /** Constructor that allows to configure the chunk size. */
-  public FastCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
+  public FastCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
     super("FastCompressingStoredFields", 
           withSegmentSuffix ? "FastCompressingStoredFields" : "",
-          CompressionMode.FAST, chunkSize);
+          CompressionMode.FAST, chunkSize, maxDocsPerChunk);
   }
 
   /** Default constructor. */
   public FastCompressingCodec() {
-    this(1 << 14, false);
+    this(1 << 14, 128, false);
   }
 }

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java Tue Dec  2 18:20:03 2014
@@ -21,14 +21,14 @@ package org.apache.lucene.codecs.compres
 public class FastDecompressionCompressingCodec extends CompressingCodec {
 
   /** Constructor that allows to configure the chunk size. */
-  public FastDecompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
+  public FastDecompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
     super("FastDecompressionCompressingStoredFields",
           withSegmentSuffix ? "FastDecompressionCompressingStoredFields" : "",
-          CompressionMode.FAST_DECOMPRESSION, chunkSize);
+          CompressionMode.FAST_DECOMPRESSION, chunkSize, maxDocsPerChunk);
   }
 
   /** Default constructor. */
   public FastDecompressionCompressingCodec() {
-    this(1 << 14, false);
+    this(1 << 14, 256, false);
   }
 }

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java Tue Dec  2 18:20:03 2014
@@ -21,14 +21,16 @@ package org.apache.lucene.codecs.compres
 public class HighCompressionCompressingCodec extends CompressingCodec {
 
   /** Constructor that allows to configure the chunk size. */
-  public HighCompressionCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
+  public HighCompressionCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
     super("HighCompressionCompressingStoredFields",
           withSegmentSuffix ? "HighCompressionCompressingStoredFields" : "",
-          CompressionMode.HIGH_COMPRESSION, chunkSize);
+          CompressionMode.HIGH_COMPRESSION, chunkSize, maxDocsPerChunk);
   }
 
   /** Default constructor. */
   public HighCompressionCompressingCodec() {
-    this(1 << 14, false);
+    // no need to have a higher block length than 32KB since deflate splits
+    // into blocks of 32KB anyway, and this is a lower bound (try to avoid > 32KB)
+    this(24576, 512, false);
   }
 }

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java?rev=1642981&r1=1642980&r2=1642981&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/dummy/DummyCompressingCodec.java Tue Dec  2 18:20:03 2014
@@ -83,15 +83,15 @@ public class DummyCompressingCodec exten
   };
 
   /** Constructor that allows to configure the chunk size. */
-  public DummyCompressingCodec(int chunkSize, boolean withSegmentSuffix) {
+  public DummyCompressingCodec(int chunkSize, int maxDocsPerChunk, boolean withSegmentSuffix) {
     super("DummyCompressingStoredFields",
           withSegmentSuffix ? "DummyCompressingStoredFields" : "",
-          DUMMY, chunkSize);
+          DUMMY, chunkSize, maxDocsPerChunk);
   }
 
   /** Default constructor. */
   public DummyCompressingCodec() {
-    this(1 << 14, false);
+    this(1 << 14, 128, false);
   }
 
 }