You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/02/05 21:53:00 UTC

svn commit: r1442738 - in /lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene: codecs/lucene42/ search/similarities/

Author: rmuir
Date: Tue Feb  5 20:53:00 2013
New Revision: 1442738

URL: http://svn.apache.org/viewvc?rev=1442738&view=rev
Log:
tune default perf (fasterButMoreRam)

Modified:
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java?rev=1442738&r1=1442737&r2=1442738&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java Tue Feb  5 20:53:00 2013
@@ -38,6 +38,7 @@ import org.apache.lucene.util.fst.Util;
 import org.apache.lucene.util.packed.BlockPackedWriter;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
 import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
 
 /**
  * Writer for {@link Lucene42DocValuesFormat}
@@ -51,14 +52,20 @@ class Lucene42DocValuesConsumer extends 
   static final byte FST = 2;
 
   static final int BLOCK_SIZE = 4096;
+  
+  static final byte DELTA_COMPRESSED = 0;
+  static final byte TABLE_COMPRESSED = 1;
+  static final byte UNCOMPRESSED = 2;
 
   final IndexOutput data, meta;
   final int maxDoc;
+  final float acceptableOverheadRatio;
   
-  Lucene42DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+  Lucene42DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException {
+    this.acceptableOverheadRatio = acceptableOverheadRatio;
+    maxDoc = state.segmentInfo.getDocCount();
     boolean success = false;
     try {
-      maxDoc = state.segmentInfo.getDocCount();
       String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
       data = state.directory.createOutput(dataName, state.context);
       CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
@@ -95,29 +102,37 @@ class Lucene42DocValuesConsumer extends 
       }
     }
 
-    final long delta = maxValue - minValue;
-    if (uniqueValues != null && (delta < 0 || PackedInts.bitsRequired(uniqueValues.size()-1) < PackedInts.bitsRequired(delta))) {
-      // smaller to tableize
+    if (uniqueValues != null) {
+      // small number of unique values
       final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
-      meta.writeByte((byte)1); // table-compressed
-      Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
-      final HashMap<Long,Integer> encode = new HashMap<Long,Integer>();
-      data.writeVInt(decode.length);
-      for (int i = 0; i < decode.length; i++) {
-        data.writeLong(decode[i]);
-        encode.put(decode[i], i);
-      }
-
-      meta.writeVInt(PackedInts.VERSION_CURRENT);
-      data.writeVInt(bitsPerValue);
+      FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
+      if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
+        meta.writeByte(UNCOMPRESSED); // uncompressed
+        for (Number nv : values) {
+          data.writeByte((byte) nv.longValue());
+        }
+      } else {
+        meta.writeByte(TABLE_COMPRESSED); // table-compressed
+        Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+        final HashMap<Long,Integer> encode = new HashMap<Long,Integer>();
+        data.writeVInt(decode.length);
+        for (int i = 0; i < decode.length; i++) {
+          data.writeLong(decode[i]);
+          encode.put(decode[i], i);
+        }
 
-      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, maxDoc, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
-      for(Number nv : values) {
-        writer.add(encode.get(nv));
+        meta.writeVInt(PackedInts.VERSION_CURRENT);
+        data.writeVInt(formatAndBits.format.getId());
+        data.writeVInt(formatAndBits.bitsPerValue);
+
+        final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+        for(Number nv : values) {
+          writer.add(encode.get(nv.longValue()));
+        }
+        writer.finish();
       }
-      writer.finish();
     } else {
-      meta.writeByte((byte)0); // delta-compressed
+      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
 
       meta.writeVInt(PackedInts.VERSION_CURRENT);
       data.writeVInt(BLOCK_SIZE);

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java?rev=1442738&r1=1442737&r2=1442738&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java Tue Feb  5 20:53:00 2013
@@ -41,6 +41,9 @@ import org.apache.lucene.util.packed.Blo
  *        the minimum value is encoded, and each entry is a delta from that minimum value.
  *    <li>Table-compressed Numerics: when the number of unique values is very small, a lookup table
  *        is written instead. Each per-document entry is instead the ordinal to this table.
+ *    <li>Uncompressed Numerics: when all values would fit into a single byte, and the 
+ *        <code>acceptableOverheadRatio</code> would pack values into 8 bits per value anyway, they
+ *        are written as absolute values (with no indirection or packing) for performance.
  *    <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
  *        Each document's value can be addressed by maxDoc*length. 
  *    <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses 
@@ -83,6 +86,9 @@ import org.apache.lucene.util.packed.Blo
  *             from the minimum value within the block. 
  *         <li>1 --&gt; table-compressed. When the number of unique numeric values is small and it would save space,
  *             a lookup table of unique values is written, followed by the ordinal for each document.
+ *         <li>2 --&gt; uncompressed. When the <code>acceptableOverheadRatio</code> parameter would upgrade the number
+ *             of bits required to 8, and all values fit in a byte, these are written as absolute binary values
+ *             for performance.
  *      </ul>
  *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
  *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
@@ -93,11 +99,12 @@ import org.apache.lucene.util.packed.Blo
  *   <p>For DocValues field, this stores the actual per-document data (the heavy-lifting)</p>
  *   <p>DocValues data (.dvd) --&gt; Header,&lt;NumericData | BinaryData | SortedData&gt;<sup>NumFields</sup></p>
  *   <ul>
- *     <li>NumericData --&gt; DeltaCompressedNumerics | TableCompressedNumerics</li>
+ *     <li>NumericData --&gt; DeltaCompressedNumerics | TableCompressedNumerics | UncompressedNumerics</li>
  *     <li>BinaryData --&gt;  {@link DataOutput#writeByte Byte}<sup>DataLength</sup>,Addresses</li>
  *     <li>SortedData --&gt; {@link FST FST&lt;Int64&gt;}</li>
  *     <li>DeltaCompressedNumerics --&gt; {@link BlockPackedWriter BlockPackedInts(blockSize=4096)}</li>
  *     <li>TableCompressedNumerics --&gt; TableSize,{@link DataOutput#writeLong Int64}<sup>TableSize</sup>,{@link PackedInts PackedInts}</li>
+ *     <li>UncompressedNumerics --&gt; {@link DataOutput#writeByte Byte}<sup>maxdoc</sup></li>
  *     <li>Addresses --&gt; {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}</li>
  *   </ul>
  * </ol>
@@ -111,7 +118,8 @@ public final class Lucene42DocValuesForm
 
   @Override
   public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+    // note: we choose DEFAULT here (its reasonably fast, and for small bpv has tiny waste)
+    return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, PackedInts.DEFAULT);
   }
   
   @Override

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java?rev=1442738&r1=1442737&r2=1442738&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java Tue Feb  5 20:53:00 2013
@@ -104,8 +104,10 @@ class Lucene42DocValuesProducer extends 
       if (fieldType == Lucene42DocValuesConsumer.NUMBER) {
         NumericEntry entry = new NumericEntry();
         entry.offset = meta.readLong();
-        entry.tableized = meta.readByte() != 0;
-        entry.packedIntsVersion = meta.readVInt();
+        entry.format = meta.readByte();
+        if (entry.format != Lucene42DocValuesConsumer.UNCOMPRESSED) {
+          entry.packedIntsVersion = meta.readVInt();
+        }
         numerics.put(fieldNumber, entry);
       } else if (fieldType == Lucene42DocValuesConsumer.BYTES) {
         BinaryEntry entry = new BinaryEntry();
@@ -143,21 +145,22 @@ class Lucene42DocValuesProducer extends 
   private NumericDocValues loadNumeric(FieldInfo field) throws IOException {
     NumericEntry entry = numerics.get(field.number);
     data.seek(entry.offset);
-    if (entry.tableized) {
+    if (entry.format == Lucene42DocValuesConsumer.TABLE_COMPRESSED) {
       int size = data.readVInt();
       final long decode[] = new long[size];
       for (int i = 0; i < decode.length; i++) {
         decode[i] = data.readLong();
       }
+      final int formatID = data.readVInt();
       final int bitsPerValue = data.readVInt();
-      final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, maxDoc, bitsPerValue);
+      final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), entry.packedIntsVersion, maxDoc, bitsPerValue);
       return new NumericDocValues() {
         @Override
         public long get(int docID) {
           return decode[(int)reader.get(docID)];
         }
       };
-    } else {
+    } else if (entry.format == Lucene42DocValuesConsumer.DELTA_COMPRESSED) {
       final int blockSize = data.readVInt();
       final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
       return new NumericDocValues() {
@@ -166,6 +169,17 @@ class Lucene42DocValuesProducer extends 
           return reader.get(docID);
         }
       };
+    } else if (entry.format == Lucene42DocValuesConsumer.UNCOMPRESSED) {
+      final byte bytes[] = new byte[maxDoc];
+      data.readBytes(bytes, 0, bytes.length);
+      return new NumericDocValues() {
+        @Override
+        public long get(int docID) {
+          return bytes[docID];
+        }
+      };
+    } else {
+      throw new IllegalStateException();
     }
   }
 
@@ -279,7 +293,7 @@ class Lucene42DocValuesProducer extends 
   
   static class NumericEntry {
     long offset;
-    boolean tableized;
+    byte format;
     int packedIntsVersion;
   }
   

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java?rev=1442738&r1=1442737&r2=1442738&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java Tue Feb  5 20:53:00 2013
@@ -24,12 +24,15 @@ import org.apache.lucene.codecs.DocValue
 import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.packed.PackedInts;
 
 /**
  * Lucene 4.2 score normalization format.
  * <p>
  * NOTE: this uses the same format as {@link Lucene42DocValuesFormat}
- * Numeric DocValues, but with different file extensions.
+ * Numeric DocValues, but with different file extensions, and passing
+ * {@link PackedInts#FASTEST} for uncompressed encoding: trading off
+ * space for performance.
  * <p>
  * Files:
  * <ul>
@@ -45,7 +48,8 @@ public final class Lucene42NormsFormat e
   
   @Override
   public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
-    return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+    // note: we choose FASTEST here (otherwise our norms are half as big but 15% slower than previous lucene)
+    return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, PackedInts.FASTEST);
   }
   
   @Override

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java?rev=1442738&r1=1442737&r2=1442738&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java Tue Feb  5 20:53:00 2013
@@ -772,23 +772,16 @@ public abstract class TFIDFSimilarity ex
     private final IDFStats stats;
     private final float weightValue;
     private final NumericDocValues norms;
-    private static final int SCORE_CACHE_SIZE = 32;
-    private float[] scoreCache = new float[SCORE_CACHE_SIZE];
     
     ExactTFIDFDocScorer(IDFStats stats, NumericDocValues norms) throws IOException {
       this.stats = stats;
       this.weightValue = stats.value;
       this.norms = norms; 
-      for (int i = 0; i < SCORE_CACHE_SIZE; i++)
-        scoreCache[i] = tf(i) * weightValue;
     }
     
     @Override
     public float score(int doc, int freq) {
-      final float raw =                                // compute tf(f)*weight
-        freq < SCORE_CACHE_SIZE                        // check cache
-        ? scoreCache[freq]                             // cache hit
-        : tf(freq)*weightValue;        // cache miss
+      final float raw = tf(freq)*weightValue;  // compute tf(f)*weight
 
       return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field
     }