You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/06/11 17:54:59 UTC

svn commit: r1601936 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/codecs/ lucene/codecs/src/java/org/apache/lucene/codecs/memory/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene49/

Author: rmuir
Date: Wed Jun 11 15:54:58 2014
New Revision: 1601936

URL: http://svn.apache.org/r1601936
Log:
LUCENE-5751: Bring MemoryDocValues up to speed

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/codecs/   (props changed)
    lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java
    lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Jun 11 15:54:58 2014
@@ -172,6 +172,8 @@ Optimizations
 * LUCENE-5750: Speed up monotonic addressing for BINARY and SORTED_SET 
   docvalues. (Robert Muir)
 
+* LUCENE-5751: Speed up MemoryDocValues. (Adrien Grand, Robert Muir)
+
 Bug fixes
 
 * LUCENE-5738: Ensure NativeFSLock prevents opening the file channel for the

Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java Wed Jun 11 15:54:58 2014
@@ -51,9 +51,9 @@ import static org.apache.lucene.codecs.m
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
+import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_COMPRESSED;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED;
 import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
-import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED;
 
 /**
  * Writer for {@link MemoryDocValuesFormat}
@@ -93,6 +93,7 @@ class MemoryDocValuesConsumer extends Do
     meta.writeLong(data.getFilePointer());
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
+    long blockSum = 0;
     long gcd = 0;
     boolean missing = false;
     // TODO: more efficient?
@@ -101,6 +102,8 @@ class MemoryDocValuesConsumer extends Do
       uniqueValues = new HashSet<>();
 
       long count = 0;
+      long currentBlockMin = Long.MAX_VALUE;
+      long currentBlockMax = Long.MIN_VALUE;
       for (Number nv : values) {
         final long v;
         if (nv == null) {
@@ -121,6 +124,9 @@ class MemoryDocValuesConsumer extends Do
           }
         }
 
+        currentBlockMin = Math.min(minValue, v);
+        currentBlockMax = Math.max(maxValue, v);
+        
         minValue = Math.min(minValue, v);
         maxValue = Math.max(maxValue, v);
 
@@ -133,8 +139,22 @@ class MemoryDocValuesConsumer extends Do
         }
 
         ++count;
+        if (count % BLOCK_SIZE == 0) {
+          final long blockDelta = currentBlockMax - currentBlockMin;
+          final int blockDeltaRequired = blockDelta < 0 ? 64 : PackedInts.bitsRequired(blockDelta);
+          final int blockBPV = PackedInts.fastestFormatAndBits(BLOCK_SIZE, blockDeltaRequired, acceptableOverheadRatio).bitsPerValue;
+          blockSum += blockBPV;
+          currentBlockMax = Long.MIN_VALUE;
+          currentBlockMin = Long.MAX_VALUE;
+        }
       }
       assert count == maxDoc;
+    } else {
+      for (Number nv : values) {
+        long v = nv.longValue();
+        maxValue = Math.max(v, maxValue);
+        minValue = Math.min(v, minValue);
+      }
     }
     
     if (missing) {
@@ -145,60 +165,99 @@ class MemoryDocValuesConsumer extends Do
     } else {
       meta.writeLong(-1L);
     }
-
+    
+    final long delta = maxValue - minValue;
+    final int deltaRequired = delta < 0 ? 64 : PackedInts.bitsRequired(delta);
+    final FormatAndBits deltaBPV = PackedInts.fastestFormatAndBits(maxDoc, deltaRequired, acceptableOverheadRatio);
+        
+    final FormatAndBits tableBPV;
     if (uniqueValues != null) {
+      tableBPV = PackedInts.fastestFormatAndBits(maxDoc, PackedInts.bitsRequired(uniqueValues.size()-1), acceptableOverheadRatio);
+    } else {
+      tableBPV = null;
+    }
+    
+    final FormatAndBits gcdBPV;
+    if (gcd != 0 && gcd != 1) {
+      final long gcdDelta = (maxValue - minValue) / gcd;
+      final int gcdRequired = gcdDelta < 0 ? 64 : PackedInts.bitsRequired(gcdDelta);
+      gcdBPV = PackedInts.fastestFormatAndBits(maxDoc, gcdRequired, acceptableOverheadRatio);
+    } else {
+      gcdBPV = null;
+    }
+    
+    boolean doBlock = false;
+    if (blockSum != 0) {
+      int numBlocks = maxDoc / BLOCK_SIZE;
+      float avgBPV = blockSum / (float)numBlocks;
+      // just a heuristic, with tiny amounts of blocks our estimate is skewed as we ignore the final "incomplete" block.
+      // with at least 4 blocks its pretty accurate. The difference must also be significant (according to acceptable overhead).
+      if (numBlocks >= 4 && (avgBPV+avgBPV*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
+        doBlock = true;
+      }
+    }
+    
+    if (tableBPV != null && (tableBPV.bitsPerValue+tableBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
       // small number of unique values
-      final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1);
-      FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
-      if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) {
-        meta.writeByte(UNCOMPRESSED); // uncompressed
-        for (Number nv : values) {
-          data.writeByte(nv == null ? 0 : (byte) nv.longValue());
-        }
-      } else {
-        meta.writeByte(TABLE_COMPRESSED); // table-compressed
-        Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
-        final HashMap<Long,Integer> encode = new HashMap<>();
-        data.writeVInt(decode.length);
-        for (int i = 0; i < decode.length; i++) {
-          data.writeLong(decode[i]);
-          encode.put(decode[i], i);
-        }
-
-        meta.writeVInt(PackedInts.VERSION_CURRENT);
-        data.writeVInt(formatAndBits.format.getId());
-        data.writeVInt(formatAndBits.bitsPerValue);
-
-        final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
-        for(Number nv : values) {
-          writer.add(encode.get(nv == null ? 0 : nv.longValue()));
-        }
-        writer.finish();
+      meta.writeByte(TABLE_COMPRESSED); // table-compressed
+      Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
+      final HashMap<Long,Integer> encode = new HashMap<>();
+      int length = 1 << tableBPV.bitsPerValue;
+      data.writeVInt(length);
+      for (int i = 0; i < decode.length; i++) {
+        data.writeLong(decode[i]);
+        encode.put(decode[i], i);
       }
-    } else if (gcd != 0 && gcd != 1) {
+      for (int i = decode.length; i < length; i++) {
+        data.writeLong(0);
+      }
+      
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      data.writeVInt(tableBPV.format.getId());
+      data.writeVInt(tableBPV.bitsPerValue);
+      
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, tableBPV.format, maxDoc, tableBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+      for(Number nv : values) {
+        writer.add(encode.get(nv == null ? 0 : nv.longValue()));
+      }
+      writer.finish();
+    } else if (gcdBPV != null && (gcdBPV.bitsPerValue+gcdBPV.bitsPerValue*acceptableOverheadRatio) < deltaBPV.bitsPerValue) {
       meta.writeByte(GCD_COMPRESSED);
       meta.writeVInt(PackedInts.VERSION_CURRENT);
       data.writeLong(minValue);
       data.writeLong(gcd);
-      data.writeVInt(BLOCK_SIZE);
+      data.writeVInt(gcdBPV.format.getId());
+      data.writeVInt(gcdBPV.bitsPerValue);
 
-      final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, gcdBPV.format, maxDoc, gcdBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
       for (Number nv : values) {
         long value = nv == null ? 0 : nv.longValue();
         writer.add((value - minValue) / gcd);
       }
       writer.finish();
-    } else {
-      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
-
+    } else if (doBlock) {
+      meta.writeByte(BLOCK_COMPRESSED); // block delta-compressed
       meta.writeVInt(PackedInts.VERSION_CURRENT);
       data.writeVInt(BLOCK_SIZE);
-
       final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
       for (Number nv : values) {
         writer.add(nv == null ? 0 : nv.longValue());
       }
       writer.finish();
+    } else {
+      meta.writeByte(DELTA_COMPRESSED); // delta-compressed
+      meta.writeVInt(PackedInts.VERSION_CURRENT);
+      final long minDelta = deltaBPV.bitsPerValue == 64 ? 0 : minValue;
+      data.writeLong(minDelta);
+      data.writeVInt(deltaBPV.format.getId());
+      data.writeVInt(deltaBPV.bitsPerValue);
+
+      final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, deltaBPV.format, maxDoc, deltaBPV.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+      for (Number nv : values) {
+        long v = nv == null ? 0 : nv.longValue();
+        writer.add(v - minDelta);
+      }
+      writer.finish();
     }
   }
   

Modified: lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java Wed Jun 11 15:54:58 2014
@@ -90,13 +90,14 @@ class MemoryDocValuesProducer extends Do
   
   static final byte DELTA_COMPRESSED = 0;
   static final byte TABLE_COMPRESSED = 1;
-  static final byte UNCOMPRESSED = 2;
+  static final byte BLOCK_COMPRESSED = 2;
   static final byte GCD_COMPRESSED = 3;
   
   static final int VERSION_START = 0;
   static final int VERSION_GCD_COMPRESSION = 1;
   static final int VERSION_CHECKSUM = 2;
-  static final int VERSION_CURRENT = VERSION_CHECKSUM;
+  static final int VERSION_BLOCKDETECTION = 3;
+  static final int VERSION_CURRENT = VERSION_BLOCKDETECTION;
     
   MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     maxDoc = state.segmentInfo.getDocCount();
@@ -163,15 +164,13 @@ class MemoryDocValuesProducer extends Do
         switch(entry.format) {
           case DELTA_COMPRESSED:
           case TABLE_COMPRESSED:
+          case BLOCK_COMPRESSED:
           case GCD_COMPRESSED:
-          case UNCOMPRESSED:
                break;
           default:
                throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
         }
-        if (entry.format != UNCOMPRESSED) {
-          entry.packedIntsVersion = meta.readVInt();
-        }
+        entry.packedIntsVersion = meta.readVInt();
         numerics.put(fieldNumber, entry);
       } else if (fieldType == BYTES) {
         BinaryEntry entry = new BinaryEntry();
@@ -248,25 +247,28 @@ class MemoryDocValuesProducer extends Do
           }
         };
       case DELTA_COMPRESSED:
-        final int blockSize = data.readVInt();
-        final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
-        ramBytesUsed.addAndGet(reader.ramBytesUsed());
-        return reader;
-      case UNCOMPRESSED:
-        final byte bytes[] = new byte[maxDoc];
-        data.readBytes(bytes, 0, bytes.length);
-        ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(bytes));
+        final long minDelta = data.readLong();
+        final int formatIDDelta = data.readVInt();
+        final int bitsPerValueDelta = data.readVInt();
+        final PackedInts.Reader deltaReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDDelta), entry.packedIntsVersion, maxDoc, bitsPerValueDelta);
+        ramBytesUsed.addAndGet(deltaReader.ramBytesUsed());
         return new NumericDocValues() {
           @Override
           public long get(int docID) {
-            return bytes[docID];
+            return minDelta + deltaReader.get(docID);
           }
         };
+      case BLOCK_COMPRESSED:
+        final int blockSize = data.readVInt();
+        final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false);
+        ramBytesUsed.addAndGet(reader.ramBytesUsed());
+        return reader;
       case GCD_COMPRESSED:
         final long min = data.readLong();
         final long mult = data.readLong();
-        final int quotientBlockSize = data.readVInt();
-        final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false);
+        final int formatIDGCD = data.readVInt();
+        final int bitsPerValueGCD = data.readVInt();
+        final PackedInts.Reader quotientReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatIDGCD), entry.packedIntsVersion, maxDoc, bitsPerValueGCD);
         ramBytesUsed.addAndGet(quotientReader.ramBytesUsed());
         return new NumericDocValues() {
           @Override

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java?rev=1601936&r1=1601935&r2=1601936&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/lucene49/Lucene49DocValuesConsumer.java Wed Jun 11 15:54:58 2014
@@ -150,13 +150,15 @@ public class Lucene49DocValuesConsumer e
     }
     
     final long delta = maxValue - minValue;
+    final int deltaBitsRequired = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
 
     final int format;
-    if (uniqueValues != null
-        && (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))) {
+    if (uniqueValues != null && DirectWriter.bitsRequired(uniqueValues.size() - 1) < deltaBitsRequired) {
       format = TABLE_COMPRESSED;
     } else if (gcd != 0 && gcd != 1) {
-      format = GCD_COMPRESSED;
+      final long gcdDelta = (maxValue - minValue) / gcd;
+      final long gcdBitsRequired = gcdDelta < 0 ? 64 : DirectWriter.bitsRequired(gcdDelta);
+      format = gcdBitsRequired < deltaBitsRequired ? GCD_COMPRESSED : DELTA_COMPRESSED;
     } else {
       format = DELTA_COMPRESSED;
     }
@@ -189,9 +191,8 @@ public class Lucene49DocValuesConsumer e
       case DELTA_COMPRESSED:
         final long minDelta = delta < 0 ? 0 : minValue;
         meta.writeLong(minDelta);
-        final int bpv = delta < 0 ? 64 : DirectWriter.bitsRequired(delta);
-        meta.writeVInt(bpv);
-        final DirectWriter writer = DirectWriter.getInstance(data, count, bpv);
+        meta.writeVInt(deltaBitsRequired);
+        final DirectWriter writer = DirectWriter.getInstance(data, count, deltaBitsRequired);
         for (Number nv : values) {
           long v = nv == null ? 0 : nv.longValue();
           writer.add(v - minDelta);