You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/11/17 11:22:27 UTC
svn commit: r1714749 - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/simpletext/
core/src/java/org/apache/lucene/util/bkd/
Author: mikemccand
Date: Tue Nov 17 10:22:27 2015
New Revision: 1714749
URL: http://svn.apache.org/viewvc?rev=1714749&view=rev
Log:
LUCENE-6891: use prefix coding when writing dimensional values in each leaf block
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Nov 17 10:22:27 2015
@@ -83,6 +83,12 @@ API Changes
IndexOutput.getName returns its name (Dawid Weiss, Robert Muir, Mike
McCandless)
+Optimizations
+
+* LUCENE-6891: Use prefix coding when writing dimensional values in
+ each leaf block in the default codec, to reduce the index
+ size (Mike McCandless)
+
Changes in Runtime Behavior
* LUCENE-6789: IndexSearcher's default Similarity is changed to BM25Similarity.
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextBKDReader.java Tue Nov 17 10:22:27 2015
@@ -63,7 +63,8 @@ class SimpleTextBKDReader extends BKDRea
}
@Override
- protected void visitDocValues(byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+ protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+ // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == packedBytesLength;
BytesRefBuilder scratch = new BytesRefBuilder();
for(int i=0;i<count;i++) {
Modified: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java (original)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDimensionalWriter.java Tue Nov 17 10:22:27 2015
@@ -128,10 +128,16 @@ class SimpleTextDimensionalWriter extend
}
@Override
- protected void writeLeafBlockPackedValue(IndexOutput out, byte[] bytes, int offset, int length) throws IOException {
- assert length == packedBytesLength;
+ protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixLengths, byte[] packedValue) {
+ // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+ }
+
+ @Override
+ protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
+ // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+ assert bytes.length == packedBytesLength;
write(out, BLOCK_VALUE);
- write(out, new BytesRef(bytes, offset, length).toString());
+ write(out, new BytesRef(bytes, 0, bytes.length).toString());
newline(out);
}
};
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java Tue Nov 17 10:22:27 2015
@@ -85,21 +85,25 @@ public class BKDReader implements Accoun
final IndexInput in;
final int[] scratchDocIDs;
final byte[] scratchPackedValue;
+ final int[] commonPrefixLengths;
final IntersectVisitor visitor;
- public IntersectState(IndexInput in, int packedBytesLength,
+ public IntersectState(IndexInput in, int numDims,
+ int packedBytesLength,
int maxPointsInLeafNode,
IntersectVisitor visitor) {
this.in = in;
this.visitor = visitor;
+ this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
this.scratchPackedValue = new byte[packedBytesLength];
}
}
public void intersect(IntersectVisitor visitor) throws IOException {
- IntersectState state = new IntersectState(in.clone(), packedBytesLength,
+ IntersectState state = new IntersectState(in.clone(), numDims,
+ packedBytesLength,
maxPointsInLeafNode,
visitor);
byte[] rootMinPacked = new byte[packedBytesLength];
@@ -147,10 +151,21 @@ public class BKDReader implements Accoun
return count;
}
- protected void visitDocValues(byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+ protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = in.readVInt();
+ commonPrefixLengths[dim] = prefix;
+ if (prefix > 0) {
+ in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
+ }
+ //System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
+ }
for(int i=0;i<count;i++) {
- in.readBytes(scratchPackedValue, 0, scratchPackedValue.length);
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = commonPrefixLengths[dim];
+ in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
+ }
visitor.visit(docIDs[i], scratchPackedValue);
}
}
@@ -186,7 +201,7 @@ public class BKDReader implements Accoun
int count = readDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
- visitDocValues(state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
+ visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
} else {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java?rev=1714749&r1=1714748&r2=1714749&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java Tue Nov 17 10:22:27 2015
@@ -104,6 +104,7 @@ public class BKDWriter implements Closea
final byte[] scratchPackedValue;
final byte[] scratch1;
final byte[] scratch2;
+ final int[] commonPrefixLengths;
private OfflinePointWriter offlinePointWriter;
private HeapPointWriter heapPointWriter;
@@ -133,6 +134,7 @@ public class BKDWriter implements Closea
scratchPackedValue = new byte[packedBytesLength];
scratch1 = new byte[packedBytesLength];
scratch2 = new byte[packedBytesLength];
+ commonPrefixLengths = new int[numDims];
// dimensional values (numDims * bytesPerDim) + ord (long) + docID (int)
bytesPerDoc = packedBytesLength + RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT;
@@ -398,6 +400,7 @@ public class BKDWriter implements Closea
innerNodeCount--;
int numLeaves = (int) (innerNodeCount+1);
+ //System.out.println("LEAVES: " + numLeaves);
// NOTE: we could save the 1+ here, to use a bit less heap at search time, but then we'd need a somewhat costly check at each
// step of the recursion to recompute the split dim:
@@ -495,8 +498,19 @@ public class BKDWriter implements Closea
}
}
- protected void writeLeafBlockPackedValue(IndexOutput out, byte[] bytes, int offset, int length) throws IOException {
- out.writeBytes(bytes, 0, length);
+ protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes) throws IOException {
+ for(int dim=0;dim<numDims;dim++) {
+ int prefix = commonPrefixLengths[dim];
+ out.writeBytes(bytes, dim*bytesPerDim+prefix, bytesPerDim-prefix);
+ }
+ }
+
+ protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
+ for(int dim=0;dim<numDims;dim++) {
+ out.writeVInt(commonPrefixes[dim]);
+ //System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
+ out.writeBytes(packedValue, dim*bytesPerDim, commonPrefixes[dim]);
+ }
}
@Override
@@ -646,14 +660,35 @@ public class BKDWriter implements Closea
// TODO: we should delta compress / only write suffix bytes, like terms dict (the values will all be "close together" since we are at
// a leaf cell):
- // Now write the full values:
+ // First pass: find the per-dim common prefix for all values in this block:
+ Arrays.fill(commonPrefixLengths, bytesPerDim);
+ for (int i=0;i<source.count;i++) {
+ if (i == 0) {
+ heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch1);
+ } else {
+ heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
+ for(int dim=0;dim<numDims;dim++) {
+ int offset = dim * bytesPerDim;
+ for(int j=0;j<commonPrefixLengths[dim];j++) {
+ if (scratch1[offset+j] != scratchPackedValue[offset+j]) {
+ commonPrefixLengths[dim] = j;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ writeCommonPrefixes(out, commonPrefixLengths, scratch1);
+
+ // Second pass: write the full values:
for (int i=0;i<source.count;i++) {
// TODO: we could do bulk copying here, avoiding the intermediate copy:
heapSource.readPackedValue(Math.toIntExact(source.start + i), scratchPackedValue);
// Make sure this value does in fact fall within this leaf cell:
assert valueInBounds(scratchPackedValue, minPackedValue, maxPackedValue);
- writeLeafBlockPackedValue(out, scratchPackedValue, 0, scratchPackedValue.length);
+ writeLeafBlockPackedValue(out, commonPrefixLengths, scratchPackedValue);
}
} else {