You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/11/17 11:22:27 UTC

svn commit: r1714749 - in /lucene/dev/trunk/lucene: ./ codecs/src/java/org/apache/lucene/codecs/simpletext/ core/src/java/org/apache/lucene/util/bkd/

Author: mikemccand
Date: Tue Nov 17 10:22:27 2015
New Revision: 1714749

URL: http://svn.apache.org/viewvc?rev=1714749&view=rev
Log:
LUCENE-6891: use prefix coding when writing dimensional values in each leaf block

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
    lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 17 10:22:27 2015
@@ -83,6 +83,12 @@ API Changes
   IndexOutput.getName returns its name (Dawid Weiss, Robert Muir, Mike
   McCandless)
 
+Optimizations
+
+* LUCENE-6891: Use prefix coding when writing dimensional values in 
+  each leaf block in the default codec, to reduce the index
+  size (Mike McCandless)
+
 Changes in Runtime Behavior
 
 * LUCENE-6789: IndexSearcher's default Similarity is changed to BM25Similarity.

Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java Tue Nov 17 10:22:27 2015
@@ -63,7 +63,8 @@ class SimpleTextBKDReader extends BKDRea
   }
 
   @Override
-  protected void visitDocValues(byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+  protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+    // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
     assert scratchPackedValue.length == packedBytesLength;
     BytesRefBuilder scratch = new BytesRefBuilder();
     for(int i=0;i<count;i++) {

Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java Tue Nov 17 10:22:27 2015
@@ -128,10 +128,16 @@ class SimpleTextDimensionalWriter extend
         }
 
         @Override
-        protected void writeLeafBlockPackedValue(IndexOutput out, byte[] bytes, int offset, int length) throws IOException {
-          assert length == packedBytesLength;
+        protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixLengths, byte[] packedValue) {
+          // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+        }
+
+        @Override
+        protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
+          // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+          assert bytes.length == packedBytesLength;
           write(out, BLOCK_VALUE);
-          write(out, new BytesRef(bytes, offset, length).toString());
+          write(out, new BytesRef(bytes, 0, bytes.length).toString());
           newline(out);
         }          
       };

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java Tue Nov 17 10:22:27 2015
@@ -85,21 +85,25 @@ public class BKDReader implements Accoun
     final IndexInput in;
     final int[] scratchDocIDs;
     final byte[] scratchPackedValue;
+    final int[] commonPrefixLengths;
 
     final IntersectVisitor visitor;
 
-    public IntersectState(IndexInput in, int packedBytesLength,
+    public IntersectState(IndexInput in, int numDims,
+                          int packedBytesLength,
                           int maxPointsInLeafNode,
                           IntersectVisitor visitor) {
       this.in = in;
       this.visitor = visitor;
+      this.commonPrefixLengths = new int[numDims];
       this.scratchDocIDs = new int[maxPointsInLeafNode];
       this.scratchPackedValue = new byte[packedBytesLength];
     }
   }
 
   public void intersect(IntersectVisitor visitor) throws IOException {
-    IntersectState state = new IntersectState(in.clone(), packedBytesLength,
+    IntersectState state = new IntersectState(in.clone(), numDims,
+                                              packedBytesLength,
                                               maxPointsInLeafNode,
                                               visitor);
     byte[] rootMinPacked = new byte[packedBytesLength];
@@ -147,10 +151,21 @@ public class BKDReader implements Accoun
     return count;
   }
 
-  protected void visitDocValues(byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+  protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
     visitor.grow(count);
+    for(int dim=0;dim<numDims;dim++) {
+      int prefix = in.readVInt();
+      commonPrefixLengths[dim] = prefix;
+      if (prefix > 0) {
+        in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
+      }
+      //System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
+    }
     for(int i=0;i<count;i++) {
-      in.readBytes(scratchPackedValue, 0, scratchPackedValue.length);
+      for(int dim=0;dim<numDims;dim++) {
+        int prefix = commonPrefixLengths[dim];
+        in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
+      }
       visitor.visit(docIDs[i], scratchPackedValue);
     }
   }
@@ -186,7 +201,7 @@ public class BKDReader implements Accoun
       int count = readDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.scratchDocIDs);
 
       // Again, this time reading values and checking with the visitor
-      visitDocValues(state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+      visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
 
     } else {
       

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java Tue Nov 17 10:22:27 2015
@@ -104,6 +104,7 @@ public class BKDWriter implements Closea
   final byte[] scratchPackedValue;
   final byte[] scratch1;
   final byte[] scratch2;
+  final int[] commonPrefixLengths;
 
   private OfflinePointWriter offlinePointWriter;
   private HeapPointWriter heapPointWriter;
@@ -133,6 +134,7 @@ public class BKDWriter implements Closea
     scratchPackedValue = new byte[packedBytesLength];
     scratch1 = new byte[packedBytesLength];
     scratch2 = new byte[packedBytesLength];
+    commonPrefixLengths = new int[numDims];
 
     // dimensional values (numDims * bytesPerDim) + ord (long) + docID (int)
     bytesPerDoc = packedBytesLength + RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT;
@@ -398,6 +400,7 @@ public class BKDWriter implements Closea
     innerNodeCount--;
 
     int numLeaves = (int) (innerNodeCount+1);
+    //System.out.println("LEAVES: " + numLeaves);
 
     // NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each
     // step of the recursion to recompute the split dim:
@@ -495,8 +498,19 @@ public class BKDWriter implements Closea
     }
   }
 
-  protected void writeLeafBlockPackedValue(IndexOutput out, byte[] bytes, int offset, int length) throws IOException {
-    out.writeBytes(bytes, 0, length);
+  protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
+    for(int dim=0;dim<numDims;dim++) {
+      int prefix = commonPrefixLengths[dim];
+      out.writeBytes(bytes, dim*bytesPerDim+prefix, bytesPerDim-prefix);
+    }
+  }
+
+  protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
+    for(int dim=0;dim<numDims;dim++) {
+      out.writeVInt(commonPrefixes[dim]);
+      //System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
+      out.writeBytes(packedValue, dim*bytesPerDim, commonPrefixes[dim]);
+    }
   }
 
   @Override
@@ -646,14 +660,35 @@ public class BKDWriter implements Closea
       // TODO: we should delta compress / only write suffix bytes, like terms dict (the values will all be "close together" since we are at
       // a leaf cell):
 
-      // Now write the full values:
+      // First pass: find the per-dim common prefix for all values in this block:
+      Arrays.fill(commonPrefixLengths, bytesPerDim);
+      for (int i=0;i<source.count;i++) {
+        if (i == 0) {
+          heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch1);
+        } else {
+          heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
+          for(int dim=0;dim<numDims;dim++) {
+            int offset = dim * bytesPerDim;
+            for(int j=0;j<commonPrefixLengths[dim];j++) {
+              if (scratch1[offset+j] != scratchPackedValue[offset+j]) {
+                commonPrefixLengths[dim] = j;
+                break;
+              }
+            }
+          }
+        }
+      }
+
+      writeCommonPrefixes(out, commonPrefixLengths, scratch1);
+
+      // Second pass: write the full values:
       for (int i=0;i<source.count;i++) {
         // TODO: we could do bulk copying here, avoiding the intermediate copy:
         heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
 
         // Make sure this value does in fact fall within this leaf cell:
         assert valueInBounds(scratchPackedValue, minPackedValue, maxPackedValue);
-        writeLeafBlockPackedValue(out, scratchPackedValue, 0, scratchPackedValue.length);
+        writeLeafBlockPackedValue(out, commonPrefixLengths, scratchPackedValue);
       }
 
     } else {