You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/12/06 04:30:03 UTC

svn commit: r1643490 - in /lucene/dev/trunk/lucene: ./ core/src/java/org/apache/lucene/codecs/lucene50/ core/src/test/org/apache/lucene/codecs/lucene50/ test-framework/src/java/org/apache/lucene/util/

Author: rmuir
Date: Sat Dec  6 03:30:02 2014
New Revision: 1643490

URL: http://svn.apache.org/r1643490
Log:
LUCENE-5914: More options for stored fields compression

Added:
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1643490&r1=1643489&r2=1643490&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Dec  6 03:30:02 2014
@@ -123,6 +123,9 @@ New Features
   queries provided that term vectors with positions, offsets, and payloads are present. This is the
   only highlighter that can highlight such queries accurately. (David Smiley)
 
+* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
+  or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
+
 Optimizations
 
 * LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java?rev=1643490&r1=1643489&r2=1643490&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java Sat Dec  6 03:30:02 2014
@@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene5
  * limitations under the License.
  */
 
+import java.util.Objects;
+
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.CompoundFormat;
 import org.apache.lucene.codecs.DocValuesFormat;
@@ -28,6 +30,7 @@ import org.apache.lucene.codecs.Postings
 import org.apache.lucene.codecs.SegmentInfoFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
@@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield
  * @lucene.experimental
  */
 public class Lucene50Codec extends Codec {
-  private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
   private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
   private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
   private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
@@ -62,15 +64,30 @@ public class Lucene50Codec extends Codec
       return Lucene50Codec.this.getDocValuesFormatForField(field);
     }
   };
+  
+  private final StoredFieldsFormat storedFieldsFormat;
 
-  /** Sole constructor. */
+  /** 
+   * Instantiates a new codec.
+   */
   public Lucene50Codec() {
+    this(Mode.BEST_SPEED);
+  }
+  
+  /** 
+   * Instantiates a new codec, specifying the stored fields compression
+   * mode to use.
+   * @param mode stored fields compression mode to use for newly 
+   *             flushed/merged segments.
+   */
+  public Lucene50Codec(Mode mode) {
     super("Lucene50");
+    this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
   }
   
   @Override
   public final StoredFieldsFormat storedFieldsFormat() {
-    return fieldsFormat;
+    return storedFieldsFormat;
   }
   
   @Override
@@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec
   /** Returns the postings format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene50"
+   *  The default implementation always returns "Lucene50".
+   *  <p>
+   *  <b>WARNING:</b> if you subclass, you are responsible for index 
+   *  backwards compatibility: future version of Lucene are only 
+   *  guaranteed to be able to read the default implementation. 
    */
   public PostingsFormat getPostingsFormatForField(String field) {
     return defaultFormat;
@@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec
   /** Returns the docvalues format that should be used for writing 
    *  new segments of <code>field</code>.
    *  
-   *  The default implementation always returns "Lucene50"
+   *  The default implementation always returns "Lucene50".
+   *  <p>
+   *  <b>WARNING:</b> if you subclass, you are responsible for index 
+   *  backwards compatibility: future version of Lucene are only 
+   *  guaranteed to be able to read the default implementation. 
    */
   public DocValuesFormat getDocValuesFormatForField(String field) {
     return defaultDVFormat;

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java?rev=1643490&r1=1643489&r2=1643490&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java Sat Dec  6 03:30:02 2014
@@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene5
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.Objects;
+
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
 import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
 import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Lucene 5.0 stored fields format.
  *
  * <p><b>Principle</b></p>
- * <p>This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
+ * <p>This {@link StoredFieldsFormat} compresses blocks of documents in
  * order to improve the compression ratio compared to document-level
  * compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
- * compression algorithm, which is fast to compress and very fast to decompress
- * data. Although the compression method that is used focuses more on speed
- * than on compression ratio, it should provide interesting compression ratios
- * for redundant inputs (such as log files, HTML or plain text).</p>
+ * compression algorithm by default in 16KB blocks, which is fast to compress 
+ * and very fast to decompress data. Although the default compression method 
+ * that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on 
+ * compression ratio, it should provide interesting compression ratios
+ * for redundant inputs (such as log files, HTML or plain text). For higher
+ * compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses 
+ * the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 24KB blocks 
+ * for a better ratio at the expense of slower performance. 
+ * These two options can be configured like this: </p>
+ * <pre class="prettyprint">
+ *   // the default: for high performance
+ *   indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ *   // instead for higher performance (but slower):
+ *   // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ * </pre>
  * <p><b>File formats</b></p>
  * <p>Stored fields are represented by two files:</p>
  * <ol>
@@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.Pac
  * larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
  * @lucene.experimental
  */
-public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
-
-  /** Sole constructor. */
+public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
+  
+  /** Configuration option for stored fields. */
+  public static enum Mode {
+    /** Trade compression ratio for retrieval speed. */
+    BEST_SPEED,
+    /** Trade retrieval speed for compression ratio. */
+    BEST_COMPRESSION
+  }
+  
+  /** Attribute key for compression mode. */
+  public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
+  
+  final Mode mode;
+  
+  /** Stored fields format with default options */
   public Lucene50StoredFieldsFormat() {
-    super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
+    this(Mode.BEST_SPEED);
+  }
+  
+  /** Stored fields format with specified mode */
+  public Lucene50StoredFieldsFormat(Mode mode) {
+    this.mode = Objects.requireNonNull(mode);
   }
 
+  @Override
+  public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+    String value = si.getAttribute(MODE_KEY);
+    if (value == null) {
+      throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
+    }
+    Mode mode = Mode.valueOf(value);
+    return impl(mode).fieldsReader(directory, si, fn, context);
+  }
+
+  @Override
+  public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
+    String previous = si.putAttribute(MODE_KEY, mode.name());
+    if (previous != null) {
+      throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
+                                      "old=" + previous + ", new=" + mode.name());
+    }
+    return impl(mode).fieldsWriter(directory, si, context);
+  }
+  
+  StoredFieldsFormat impl(Mode mode) {
+    switch (mode) {
+      case BEST_SPEED: 
+        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
+      case BEST_COMPRESSION: 
+        return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
+      default: throw new AssertionError();
+    }
+  }
 }

Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java?rev=1643490&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java Sat Dec  6 03:30:02 2014
@@ -0,0 +1,81 @@
+package org.apache.lucene.codecs.lucene50;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
+public class TestLucene50StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
+  @Override
+  protected Codec getCodec() {
+    return new Lucene50Codec(Mode.BEST_COMPRESSION);
+  }
+  
+  /**
+   * Change compression params (leaving it the same for old segments)
+   * and tests that nothing breaks.
+   */
+  public void testMixedCompressions() throws Exception {
+    Directory dir = newDirectory();
+    for (int i = 0; i < 10; i++) {
+      IndexWriterConfig iwc = newIndexWriterConfig();
+      iwc.setCodec(new Lucene50Codec(RandomPicks.randomFrom(random(), Mode.values())));
+      IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
+      Document doc = new Document();
+      doc.add(new StoredField("field1", "value1"));
+      doc.add(new StoredField("field2", "value2"));
+      iw.addDocument(doc);
+      if (random().nextInt(4) == 0) {
+        iw.forceMerge(1);
+      }
+      iw.commit();
+      iw.close();
+    }
+    
+    DirectoryReader ir = DirectoryReader.open(dir);
+    assertEquals(10, ir.numDocs());
+    ir.close();
+    // checkindex
+    dir.close();
+  }
+  
+  public void testInvalidOptions() throws Exception {
+    try {
+      new Lucene50Codec(null);
+      fail("didn't hit exception");
+    } catch (NullPointerException expected) {
+      // expected
+    }
+    
+    try {
+      new Lucene50StoredFieldsFormat(null);
+      fail("didn't hit exception");
+    } catch (NullPointerException expected) {
+      // expected
+    }
+  }
+}

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java?rev=1643490&r1=1643489&r2=1643490&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java Sat Dec  6 03:30:02 2014
@@ -36,6 +36,9 @@ import org.apache.lucene.codecs.assertin
 import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
 import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
 import org.apache.lucene.codecs.compressing.CompressingCodec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
@@ -44,7 +47,9 @@ import org.apache.lucene.search.similari
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
 import org.junit.internal.AssumptionViolatedException;
+
 import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
 import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
 import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClass
       codec = new AssertingCodec();
     } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
       codec = CompressingCodec.randomInstance(random);
+    } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
+      codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
     } else if (!"random".equals(TEST_CODEC)) {
       codec = Codec.forName(TEST_CODEC);
     } else if ("random".equals(TEST_POSTINGSFORMAT)) {