You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/12/06 05:02:46 UTC
svn commit: r1643491 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene50/
lucene/core/src/test/org/apache/lucene/codecs/lucene50/
lucene/test-framework/ lucene/test-framework/src/java/org/...
Author: rmuir
Date: Sat Dec 6 04:02:45 2014
New Revision: 1643491
URL: http://svn.apache.org/r1643491
Log:
LUCENE-5914: More options for stored fields compression
Added:
lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
- copied unchanged from r1643490, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50StoredFieldsFormatHighCompression.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/core/ (props changed)
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
lucene/dev/branches/branch_5x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Sat Dec 6 04:02:45 2014
@@ -101,6 +101,9 @@ New Features
queries provided that term vectors with positions, offsets, and payloads are present. This is the
only highlighter that can highlight such queries accurately. (David Smiley)
+* LUCENE-5914: Add an option to Lucene50Codec to support either BEST_SPEED
+ or BEST_COMPRESSION for stored fields. (Adrien Grand, Robert Muir)
+
Optimizations
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50Codec.java Sat Dec 6 04:02:45 2014
@@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene5
* limitations under the License.
*/
+import java.util.Objects;
+
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -28,6 +30,7 @@ import org.apache.lucene.codecs.Postings
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -42,7 +45,6 @@ import org.apache.lucene.codecs.perfield
* @lucene.experimental
*/
public class Lucene50Codec extends Codec {
- private final StoredFieldsFormat fieldsFormat = new Lucene50StoredFieldsFormat();
private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene50FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene50SegmentInfoFormat();
@@ -62,15 +64,30 @@ public class Lucene50Codec extends Codec
return Lucene50Codec.this.getDocValuesFormatForField(field);
}
};
+
+ private final StoredFieldsFormat storedFieldsFormat;
- /** Sole constructor. */
+ /**
+ * Instantiates a new codec.
+ */
public Lucene50Codec() {
+ this(Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression
+ * mode to use.
+ * @param mode stored fields compression mode to use for newly
+ * flushed/merged segments.
+ */
+ public Lucene50Codec(Mode mode) {
super("Lucene50");
+ this.storedFieldsFormat = new Lucene50StoredFieldsFormat(Objects.requireNonNull(mode));
}
@Override
public final StoredFieldsFormat storedFieldsFormat() {
- return fieldsFormat;
+ return storedFieldsFormat;
}
@Override
@@ -106,7 +123,11 @@ public class Lucene50Codec extends Codec
/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
- * The default implementation always returns "Lucene50"
+ * The default implementation always returns "Lucene50".
+ * <p>
+ * <b>WARNING:</b> if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public PostingsFormat getPostingsFormatForField(String field) {
return defaultFormat;
@@ -115,7 +136,11 @@ public class Lucene50Codec extends Codec
/** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
*
- * The default implementation always returns "Lucene50"
+ * The default implementation always returns "Lucene50".
+ * <p>
+ * <b>WARNING:</b> if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation.
*/
public DocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50StoredFieldsFormat.java Sat Dec 6 04:02:45 2014
@@ -17,26 +17,46 @@ package org.apache.lucene.codecs.lucene5
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Objects;
+
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.StoredFieldsReader;
+import org.apache.lucene.codecs.StoredFieldsWriter;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsFormat;
import org.apache.lucene.codecs.compressing.CompressingStoredFieldsIndexWriter;
import org.apache.lucene.codecs.compressing.CompressionMode;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.packed.PackedInts;
/**
* Lucene 5.0 stored fields format.
*
* <p><b>Principle</b></p>
- * <p>This {@link StoredFieldsFormat} compresses blocks of 16KB of documents in
+ * <p>This {@link StoredFieldsFormat} compresses blocks of documents in
* order to improve the compression ratio compared to document-level
* compression. It uses the <a href="http://code.google.com/p/lz4/">LZ4</a>
- * compression algorithm, which is fast to compress and very fast to decompress
- * data. Although the compression method that is used focuses more on speed
- * than on compression ratio, it should provide interesting compression ratios
- * for redundant inputs (such as log files, HTML or plain text).</p>
+ * compression algorithm by default in 16KB blocks, which is fast to compress
+ * and very fast to decompress data. Although the default compression method
+ * that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on
+ * compression ratio, it should provide interesting compression ratios
+ * for redundant inputs (such as log files, HTML or plain text). For higher
+ * compression, you can choose ({@link Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses
+ * the <a href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE</a> algorithm with 24KB blocks
+ * for a better ratio at the expense of slower performance.
+ * These two options can be configured like this: </p>
+ * <pre class="prettyprint">
+ * // the default: for high performance
+ * indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_SPEED));
+ * // instead for higher performance (but slower):
+ * // indexWriterConfig.setCodec(new Lucene50Codec(Mode.BEST_COMPRESSION));
+ * </pre>
* <p><b>File formats</b></p>
* <p>Stored fields are represented by two files:</p>
* <ol>
@@ -114,11 +134,58 @@ import org.apache.lucene.util.packed.Pac
* larger than (<tt>2<sup>31</sup> - 2<sup>14</sup></tt>) bytes.</p>
* @lucene.experimental
*/
-public final class Lucene50StoredFieldsFormat extends CompressingStoredFieldsFormat {
-
- /** Sole constructor. */
+public final class Lucene50StoredFieldsFormat extends StoredFieldsFormat {
+
+ /** Configuration option for stored fields. */
+ public static enum Mode {
+ /** Trade compression ratio for retrieval speed. */
+ BEST_SPEED,
+ /** Trade retrieval speed for compression ratio. */
+ BEST_COMPRESSION
+ }
+
+ /** Attribute key for compression mode. */
+ public static final String MODE_KEY = Lucene50StoredFieldsFormat.class.getSimpleName() + ".mode";
+
+ final Mode mode;
+
+ /** Stored fields format with default options */
public Lucene50StoredFieldsFormat() {
- super("Lucene50StoredFields", CompressionMode.FAST, 1 << 14, 128);
+ this(Mode.BEST_SPEED);
+ }
+
+ /** Stored fields format with specified mode */
+ public Lucene50StoredFieldsFormat(Mode mode) {
+ this.mode = Objects.requireNonNull(mode);
}
+ @Override
+ public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException {
+ String value = si.getAttribute(MODE_KEY);
+ if (value == null) {
+ throw new IllegalStateException("missing value for " + MODE_KEY + " for segment: " + si.name);
+ }
+ Mode mode = Mode.valueOf(value);
+ return impl(mode).fieldsReader(directory, si, fn, context);
+ }
+
+ @Override
+ public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException {
+ String previous = si.putAttribute(MODE_KEY, mode.name());
+ if (previous != null) {
+ throw new IllegalStateException("found existing value for " + MODE_KEY + " for segment: " + si.name +
+ "old=" + previous + ", new=" + mode.name());
+ }
+ return impl(mode).fieldsWriter(directory, si, context);
+ }
+
+ StoredFieldsFormat impl(Mode mode) {
+ switch (mode) {
+ case BEST_SPEED:
+ return new CompressingStoredFieldsFormat("Lucene50StoredFieldsFast", CompressionMode.FAST, 1 << 14, 128);
+ case BEST_COMPRESSION:
+ return new CompressingStoredFieldsFormat("Lucene50StoredFieldsHigh", CompressionMode.HIGH_COMPRESSION, 24576, 512);
+ default: throw new AssertionError();
+ }
+ }
}
Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java?rev=1643491&r1=1643490&r2=1643491&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java Sat Dec 6 04:02:45 2014
@@ -36,6 +36,9 @@ import org.apache.lucene.codecs.assertin
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@@ -44,7 +47,9 @@ import org.apache.lucene.search.similari
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.junit.internal.AssumptionViolatedException;
+
import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClass
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
+ } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
+ codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {