You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/12/06 05:02:46 UTC

svn commit: r1643491 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene50/ lucene/core/src/test/org/apache/lucene/codecs/lucene50/ lucene/test-framework/ lucene/test-framework/src/java/org/...

Author: rmuir
Date: Sat Dec  6 04:02:45 2014
New Revision: 1643491

URL: http://svn.apache.org/r1643491
Log:
LUCENE-5914: More options for stored fields compression

Added:
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
      - copied unchanged from r1643490, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
    lucene/dev/branches/branch_5x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Sat Dec  6 04:02:45 2014
@@ -101,6 +101,9 @@ New Features
   queries provided that term vectors with positions, offsets, and payloads are present. This is the
   only highlighter that can highlight such queries accurately. (David Smiley)
 
+* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
+  or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
+
 Optimizations
 
 * LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java Sat Dec  6 04:02:45 2014
@@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene5
  * limitations under the License.
  */
 
+import java.util.Objects;
+
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.CompoundFormat;
 import org.apache.lucene.codecs.DocValuesFormat;
@@ -28,6 +30,7 @@ import org.apache.lucene.codecs.Postings
 import org.apache.lucene.codecs.SegmentInfoFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
@@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield
  * @lucene.experimental
  */
 public class Lucene50Codec extends Codec {
-  private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
   private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
   private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
   private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
@@ -62,15 +64,30 @@ public class Lucene50Codec extends Codec
       return Lucene50Codec.this.getDocValuesFormatForField(field);
     }
   };
+  
+  private final StoredFieldsFormat storedFieldsFormat;
 
-  /** Sole constructor. */
+  /** 
+   * Instantiates a new codec.
+   */
   public Lucene50Codec() {
+    this(Mode.BEST_SPEED);
+  }
+  
+  /** 
+   * Instantiates a new codec, specifying the stored fields compression
+   * mode to use.
+   * @param mode stored fields compression mode to use for newly 
+   *             flushed/merged segments.
+   */
+  public Lucene50Codec(Mode mode) {
     super("Lucene50");
+    this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
   }
   
   @Override
   public final StoredFieldsFormat storedFieldsFormat() {
-    return fieldsFormat;
+    return storedFieldsFormat;
   }
   
   @Override
@@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec
   /** Returns the postings format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene50"
+   *  The default implementation always returns "Lucene50".
+   *  <p>
+   *  <b>WARNING:</b> if you subclass, you are responsible for index 
+   *  backwards compatibility: future version of Lucene are only 
+   *  guaranteed to be able to read the default implementation. 
    */
   public PostingsFormat getPostingsFormatForField(String field) {
     return defaultFormat;
@@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec
   /** Returns the docvalues format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene50"
+   *  The default implementation always returns "Lucene50".
+   *  <p>
+   *  <b>WARNING:</b> if you subclass, you are responsible for index 
+   *  backwards compatibility: future version of Lucene are only 
+   *  guaranteed to be able to read the default implementation. 
    */
   public DocValuesFormat getDocValuesFormatForField(String field) {
     return defaultDVFormat;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java Sat Dec  6 04:02:45 2014
@@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene5
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.Objects;
+
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
 import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Lucene 5.0 stored fields format.
  *
  * <p><b>Principle</b></p>
- * <p>This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
+ * <p>This {@link StoredFieldsFormat} compresses blocks of documents in
  * order to improve the compression ratio compared to document-level
  * compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
- * compression algorithm, which is fast to compress and very fast to decompress
- * data. Although the compression method that is used focuses more on speed
- * than on compression ratio, it should provide interesting compression ratios
- * for redundant inputs (such as log files, HTML or plain text).</p>
+ * compression algorithm by default in 16KB blocks, which is fast to compress 
+ * and very fast to decompress data. Although the default compression method 
+ * that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on 
+ * compression ratio, it should provide interesting compression ratios
+ * for redundant inputs (such as log files, HTML or plain text). For higher
+ * compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses 
+ * the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 24KB blocks 
+ * for a better ratio at the expense of slower performance. 
+ * These two options can be configured like this: </p>
+ * <pre class="prettyprint">
+ *   // the default: for high performance
+ *   indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ *   // instead for higher performance (but slower):
+ *   // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ * </pre>
  * <p><b>File formats</b></p>
  * <p>Stored fields are represented by two files:</p>
  * <ol>
@@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.Pac
  * larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
  * @lucene.experimental
  */
-public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
-
-  /** Sole constructor. */
+public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
+  
+  /** Configuration option for stored fields. */
+  public static enum Mode {
+    /** Trade compression ratio for retrieval speed. */
+    BEST_SPEED,
+    /** Trade retrieval speed for compression ratio. */
+    BEST_COMPRESSION
+  }
+  
+  /** Attribute key for compression mode. */
+  public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
+  
+  final Mode mode;
+  
+  /** Stored fields format with default options */
   public Lucene50StoredFieldsFormat() {
-    super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
+    this(Mode.BEST_SPEED);
+  }
+  
+  /** Stored fields format with specified mode */
+  public Lucene50StoredFieldsFormat(Mode mode) {
+    this.mode = Objects.requireNonNull(mode);
   }
 
+  @Override
+  public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+    String value = si.getAttribute(MODE_KEY);
+    if (value == null) {
+      throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
+    }
+    Mode mode = Mode.valueOf(value);
+    return impl(mode).fieldsReader(directory, si, fn, context);
+  }
+
+  @Override
+  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
+    String previous = si.putAttribute(MODE_KEY, mode.name());
+    if (previous != null) {
+      throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
+                                      "old=" + previous + ", new=" + mode.name());
+    }
+    return impl(mode).fieldsWriter(directory, si, context);
+  }
+  
+  StoredFieldsFormat impl(Mode mode) {
+    switch (mode) {
+      case BEST_SPEED: 
+        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
+      case BEST_COMPRESSION: 
+        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
+      default: throw new AssertionError();
+    }
+  }
 }

Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java Sat Dec  6 04:02:45 2014
@@ -36,6 +36,9 @@ import org.apache.lucene.codecs.assertin
 import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
 import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
 import org.apache.lucene.codecs.compressing.CompressingCodec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
@@ -44,7 +47,9 @@ import org.apache.lucene.search.similari
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.junit.internal.AssumptionViolatedException;
+
 import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
 import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
 import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClass
       codec = new AssertingCodec();
     } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
       codec = CompressingCodec.randomInstance(random);
+    } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
+      codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
     } else if (!"random".equals(TEST_CODEC)) {
       codec = Codec.forName(TEST_CODEC);
     } else if ("random".equals(TEST_POSTINGSFORMAT)) {