You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/01/22 15:54:56 UTC
svn commit: r1436983 - in /lucene/dev/branches/lucene4547/lucene:
codecs/src/java/org/apache/lucene/codecs/diskdv/
core/src/java/org/apache/lucene/codecs/lucene42/
core/src/java/org/apache/lucene/util/packed/
core/src/test/org/apache/lucene/util/packed/
Author: jpountz
Date: Tue Jan 22 14:54:56 2013
New Revision: 1436983
URL: http://svn.apache.org/viewvc?rev=1436983&view=rev
Log:
Improved memory efficiency for the bytes index.
Added:
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/AbstractBlockPackedWriter.java (with props)
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java (with props)
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedWriter.java (with props)
Modified:
lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java
lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
Modified: lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java Tue Jan 22 14:54:56 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.codecs.diskdv;
*/
import java.io.IOException;
-import java.util.Iterator;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
@@ -29,6 +28,7 @@ import org.apache.lucene.store.IndexOutp
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
class DiskDocValuesConsumer extends DocValuesConsumer {
@@ -99,32 +99,18 @@ class DiskDocValuesConsumer extends DocV
// if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
// otherwise, we need to record the length fields...
- // TODO: make this more efficient. this is just as inefficient as 4.0 codec.... we can do much better.
if (minLength != maxLength) {
- addNumericField(field, new Iterable<Number>() {
- @Override
- public Iterator<Number> iterator() {
- final Iterator<BytesRef> inner = values.iterator();
- return new Iterator<Number>() {
- long addr = 0;
-
- @Override
- public boolean hasNext() {
- return inner.hasNext();
- }
-
- @Override
- public Number next() {
- BytesRef b = inner.next();
- addr += b.length;
- return Long.valueOf(addr);
- }
-
- @Override
- public void remove() { throw new UnsupportedOperationException(); }
- };
- }
- });
+ meta.writeLong(data.getFilePointer());
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ meta.writeVInt(BLOCK_SIZE);
+
+ final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
+ long addr = 0;
+ for (BytesRef v : values) {
+ addr += v.length;
+ writer.add(addr);
+ }
+ writer.finish();
}
}
Modified: lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java Tue Jan 22 14:54:56 2013
@@ -33,9 +33,11 @@ import org.apache.lucene.index.NumericDo
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.BlockPackedReader;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
class DiskDocValuesProducer extends DocValuesProducer {
private final Map<Integer,NumericEntry> numerics;
@@ -81,28 +83,14 @@ class DiskDocValuesProducer extends DocV
} else if (type == DocValuesType.BINARY) {
BinaryEntry b = readBinaryEntry(meta);
binaries.put(fieldNumber, b);
- if (b.minLength != b.maxLength) {
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("binary entry for field: " + fieldNumber + " is corrupt");
- }
- // variable length byte[]: read addresses as a numeric dv field
- numerics.put(fieldNumber, readNumericEntry(meta));
- }
} else if (type == DocValuesType.SORTED) {
BinaryEntry b = readBinaryEntry(meta);
binaries.put(fieldNumber, b);
- if (b.minLength != b.maxLength) {
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
- }
- // variable length byte[]: read addresses as a numeric dv field
- numerics.put(fieldNumber, readNumericEntry(meta));
- }
- // sorted byte[]: read ords as a numeric dv field
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
}
- ords.put(fieldNumber, readNumericEntry(meta));
+ NumericEntry n = readNumericEntry(meta);
+ ords.put(fieldNumber, n);
}
fieldNumber = meta.readVInt();
}
@@ -123,6 +111,11 @@ class DiskDocValuesProducer extends DocV
entry.maxLength = meta.readVInt();
entry.count = meta.readVInt();
entry.offset = meta.readLong();
+ if (entry.minLength != entry.maxLength) {
+ entry.addressesOffset = meta.readLong();
+ entry.packedIntsVersion = meta.readVInt();
+ entry.blockSize = meta.readVInt();
+ }
return entry;
}
@@ -157,6 +150,7 @@ class DiskDocValuesProducer extends DocV
private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
final IndexInput data = this.data.clone();
+
return new BinaryDocValues() {
@Override
public void get(int docID, BytesRef result) {
@@ -178,18 +172,20 @@ class DiskDocValuesProducer extends DocV
private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
final IndexInput data = this.data.clone();
- final NumericDocValues addresses = getNumeric(field);
+ data.seek(bytes.addressesOffset);
+
+ final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
return new BinaryDocValues() {
@Override
public void get(int docID, BytesRef result) {
- long startAddress = docID == 0 ? bytes.offset : bytes.offset + addresses.get(docID-1);
+ long startAddress = bytes.offset + (docID == 0 ? 0 : + addresses.get(docID-1));
long endAddress = bytes.offset + addresses.get(docID);
int length = (int) (endAddress - startAddress);
try {
data.seek(startAddress);
if (result.bytes.length < length) {
result.offset = 0;
- result.bytes = new byte[length];
+ result.bytes = new byte[ArrayUtil.oversize(length, 1)];
}
data.readBytes(result.bytes, result.offset, length);
result.length = length;
@@ -243,5 +239,8 @@ class DiskDocValuesProducer extends DocV
int count;
int minLength;
int maxLength;
+ long addressesOffset;
+ int packedIntsVersion;
+ int blockSize;
}
}
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java Tue Jan 22 14:54:56 2013
@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.lucene4
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
@@ -37,6 +36,7 @@ import org.apache.lucene.util.fst.FST.IN
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.BlockPackedWriter;
+import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -113,8 +113,8 @@ class Lucene42DocValuesConsumer extends
encode.put(decode[i], i);
}
- data.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeVInt(count);
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ meta.writeVInt(count);
data.writeVInt(bitsPerValue);
final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, count, bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
@@ -125,8 +125,8 @@ class Lucene42DocValuesConsumer extends
} else {
meta.writeByte((byte)0); // delta-compressed
- data.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeVInt(count);
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ meta.writeVInt(count);
data.writeVInt(BLOCK_SIZE);
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
@@ -164,11 +164,14 @@ class Lucene42DocValuesConsumer extends
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
final long startFP = data.getFilePointer();
+ int count = 0;
for(BytesRef v : values) {
minLength = Math.min(minLength, v.length);
maxLength = Math.max(maxLength, v.length);
data.writeBytes(v.bytes, v.offset, v.length);
+ ++count;
}
+ meta.writeVInt(count);
meta.writeLong(startFP);
meta.writeLong(data.getFilePointer() - startFP);
meta.writeVInt(minLength);
@@ -176,32 +179,17 @@ class Lucene42DocValuesConsumer extends
// if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
// otherwise, we need to record the length fields...
- // TODO: make this more efficient. this is just as inefficient as 4.0 codec.... we can do much better.
if (minLength != maxLength) {
- addNumericField(field, new Iterable<Number>() {
- @Override
- public Iterator<Number> iterator() {
- final Iterator<BytesRef> inner = values.iterator();
- return new Iterator<Number>() {
- long addr = 0;
-
- @Override
- public boolean hasNext() {
- return inner.hasNext();
- }
-
- @Override
- public Number next() {
- BytesRef b = inner.next();
- addr += b.length;
- return Long.valueOf(addr);
- }
-
- @Override
- public void remove() { throw new UnsupportedOperationException(); }
- };
- }
- });
+ meta.writeVInt(PackedInts.VERSION_CURRENT);
+ meta.writeVInt(BLOCK_SIZE);
+
+ final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
+ long addr = 0;
+ for (BytesRef v : values) {
+ addr += v.length;
+ writer.add(addr);
+ }
+ writer.finish();
}
}
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java Tue Jan 22 14:54:56 2013
@@ -43,6 +43,7 @@ import org.apache.lucene.util.fst.FST.By
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.BlockPackedReader;
+import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import org.apache.lucene.util.packed.PackedInts;
class Lucene42DocValuesProducer extends DocValuesProducer {
@@ -101,13 +102,20 @@ class Lucene42DocValuesProducer extends
NumericEntry entry = new NumericEntry();
entry.offset = meta.readLong();
entry.tableized = meta.readByte() != 0;
+ entry.packedIntsVersion = meta.readVInt();
+ entry.count = meta.readVInt();
numerics.put(fieldNumber, entry);
} else if (fieldType == Lucene42DocValuesConsumer.BYTES) {
BinaryEntry entry = new BinaryEntry();
+ entry.count = meta.readVInt();
entry.offset = meta.readLong();
entry.numBytes = meta.readLong();
entry.minLength = meta.readVInt();
entry.maxLength = meta.readVInt();
+ if (entry.minLength != entry.maxLength) {
+ entry.packedIntsVersion = meta.readVInt();
+ entry.blockSize = meta.readVInt();
+ }
binaries.put(fieldNumber, entry);
} else if (fieldType == Lucene42DocValuesConsumer.FST) {
FSTEntry entry = new FSTEntry();
@@ -140,10 +148,8 @@ class Lucene42DocValuesProducer extends
for (int i = 0; i < decode.length; i++) {
decode[i] = data.readLong();
}
- final int packedIntsVersion = data.readVInt();
- final int count = data.readVInt();
final int bitsPerValue = data.readVInt();
- final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.PACKED, packedIntsVersion, count, bitsPerValue);
+ final PackedInts.Reader reader = PackedInts.getReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, entry.count, bitsPerValue);
return new NumericDocValues() {
@Override
public long get(int docID) {
@@ -151,10 +157,8 @@ class Lucene42DocValuesProducer extends
}
};
} else {
- final int packedIntsVersion = data.readVInt();
- final int count = data.readVInt();
final int blockSize = data.readVInt();
- final BlockPackedReader reader = new BlockPackedReader(data, packedIntsVersion, blockSize, count, false);
+ final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, entry.count, false);
return new NumericDocValues() {
@Override
public long get(int docID) {
@@ -191,15 +195,15 @@ class Lucene42DocValuesProducer extends
}
};
} else {
- final NumericDocValues addresses = getNumeric(field);
+ final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
return new BinaryDocValues() {
@Override
public void get(int docID, BytesRef result) {
- int startAddress = docID == 0 ? 0 : (int) addresses.get(docID-1);
- int endAddress = (int)addresses.get(docID);
+ long startAddress = docID == 0 ? 0 : addresses.get(docID-1);
+ long endAddress = addresses.get(docID);
result.bytes = bytes;
- result.offset = startAddress;
- result.length = endAddress - startAddress;
+ result.offset = (int) startAddress;
+ result.length = (int) (endAddress - startAddress);
}
};
}
@@ -275,13 +279,18 @@ class Lucene42DocValuesProducer extends
static class NumericEntry {
long offset;
boolean tableized;
+ int packedIntsVersion;
+ int count;
}
static class BinaryEntry {
+ int count;
long offset;
long numBytes;
int minLength;
int maxLength;
+ int packedIntsVersion;
+ int blockSize;
}
static class FSTEntry {
Added: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/AbstractBlockPackedWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/AbstractBlockPackedWriter.java?rev=1436983&view=auto
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/AbstractBlockPackedWriter.java (added)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/AbstractBlockPackedWriter.java Tue Jan 22 14:54:56 2013
@@ -0,0 +1,132 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataOutput;
+
+abstract class AbstractBlockPackedWriter {
+
+ static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
+ static final int MIN_VALUE_EQUALS_0 = 1 << 0;
+ static final int BPV_SHIFT = 1;
+
+ static void checkBlockSize(int blockSize) {
+ if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
+ throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
+ }
+ if (blockSize < 64) {
+ throw new IllegalArgumentException("blockSize must be >= 64, got " + blockSize);
+ }
+ if ((blockSize & (blockSize - 1)) != 0) {
+ throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize);
+ }
+ }
+
+ static long zigZagEncode(long n) {
+ return (n >> 63) ^ (n << 1);
+ }
+
+ // same as DataOutput.writeVLong but accepts negative values
+ static void writeVLong(DataOutput out, long i) throws IOException {
+ int k = 0;
+ while ((i & ~0x7FL) != 0L && k++ < 8) {
+ out.writeByte((byte)((i & 0x7FL) | 0x80L));
+ i >>>= 7;
+ }
+ out.writeByte((byte) i);
+ }
+
+ protected DataOutput out;
+ protected final long[] values;
+ protected byte[] blocks;
+ protected int off;
+ protected long ord;
+ protected boolean finished;
+
+ /**
+ * Sole constructor.
+ * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+ */
+ public AbstractBlockPackedWriter(DataOutput out, int blockSize) {
+ checkBlockSize(blockSize);
+ reset(out);
+ values = new long[blockSize];
+ }
+
+ /** Reset this writer to wrap <code>out</code>. The block size remains unchanged. */
+ public void reset(DataOutput out) {
+ assert out != null;
+ this.out = out;
+ off = 0;
+ ord = 0L;
+ finished = false;
+ }
+
+ private void checkNotFinished() {
+ if (finished) {
+ throw new IllegalStateException("Already finished");
+ }
+ }
+
+ /** Append a new long. */
+ public void add(long l) throws IOException {
+ checkNotFinished();
+ if (off == values.length) {
+ flush();
+ }
+ values[off++] = l;
+ ++ord;
+ }
+
+ /** Flush all buffered data to disk. This instance is not usable anymore
+ * after this method has been called until {@link #reset(DataOutput)} has
+ * been called. */
+ public void finish() throws IOException {
+ checkNotFinished();
+ if (off > 0) {
+ flush();
+ }
+ finished = true;
+ }
+
+ /** Return the number of values which have been added. */
+ public long ord() {
+ return ord;
+ }
+
+ protected abstract void flush() throws IOException;
+
+ protected final void writeValues(int bitsRequired) throws IOException {
+ final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired);
+ final int iterations = values.length / encoder.valueCount();
+ final int blockSize = encoder.blockCount() * 8 * iterations;
+ if (blocks == null || blocks.length < blockSize) {
+ blocks = new byte[blockSize];
+ }
+ if (off < values.length) {
+ Arrays.fill(values, off, values.length, 0L);
+ }
+ encoder.encode(values, 0, blocks, 0, iterations);
+ final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired);
+ out.writeBytes(blocks, blockCount);
+ }
+
+}
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java Tue Jan 22 14:54:56 2013
@@ -42,7 +42,7 @@ public final class BlockPackedReader {
public BlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
checkBlockSize(blockSize);
this.valueCount = valueCount;
- blockShift = Long.numberOfTrailingZeros(blockSize);
+ blockShift = Integer.numberOfTrailingZeros(blockSize);
blockMask = blockSize - 1;
final int numBlocks = (int) (valueCount / blockSize) + (valueCount % blockSize == 0 ? 0 : 1);
if (numBlocks * blockSize < valueCount) {
@@ -62,7 +62,9 @@ public final class BlockPackedReader {
}
minValues[i] = zigZagDecode(1L + readVLong(in));
}
- if (bitsPerValue != 0) {
+ if (bitsPerValue == 0) {
+ subReaders[i] = new PackedInts.NullReader(blockSize);
+ } else {
final int size = (int) Math.min(blockSize, valueCount - (long) i * blockSize);
if (direct) {
final long pointer = in.getFilePointer();
@@ -80,9 +82,6 @@ public final class BlockPackedReader {
public long get(long index) {
assert index >= 0 && index < valueCount;
final int block = (int) (index >>> blockShift);
- if (subReaders[block] == null) {
- return minValues == null ? 0 : minValues[block];
- }
final int idx = (int) (index & blockMask);
return (minValues == null ? 0 : minValues[block]) + subReaders[block].get(idx);
}
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java Tue Jan 22 14:54:56 2013
@@ -18,7 +18,6 @@ package org.apache.lucene.util.packed;
*/
import java.io.IOException;
-import java.util.Arrays;
import org.apache.lucene.store.DataOutput;
@@ -33,92 +32,17 @@ import org.apache.lucene.store.DataOutpu
* @see BlockPackedReaderIterator
* @lucene.internal
*/
-public final class BlockPackedWriter {
-
- static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
- static final int MIN_VALUE_EQUALS_0 = 1 << 0;
- static final int BPV_SHIFT = 1;
-
- static void checkBlockSize(int blockSize) {
- if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
- throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
- }
- if (blockSize < 64) {
- throw new IllegalArgumentException("blockSize must be >= 64, got " + blockSize);
- }
- if ((blockSize & (blockSize - 1)) != 0) {
- throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize);
- }
- }
-
- static long zigZagEncode(long n) {
- return (n >> 63) ^ (n << 1);
- }
-
- // same as DataOutput.writeVLong but accepts negative values
- static void writeVLong(DataOutput out, long i) throws IOException {
- int k = 0;
- while ((i & ~0x7FL) != 0L && k++ < 8) {
- out.writeByte((byte)((i & 0x7FL) | 0x80L));
- i >>>= 7;
- }
- out.writeByte((byte) i);
- }
-
- DataOutput out;
- final long[] values;
- byte[] blocks;
- int off;
- long ord;
- boolean finished;
+public final class BlockPackedWriter extends AbstractBlockPackedWriter {
/**
* Sole constructor.
- * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+ * @param blockSize the number of values of a single block, must be a power of 2
*/
public BlockPackedWriter(DataOutput out, int blockSize) {
- checkBlockSize(blockSize);
- reset(out);
- values = new long[blockSize];
- }
-
- /** Reset this writer to wrap <code>out</code>. The block size remains unchanged. */
- public void reset(DataOutput out) {
- assert out != null;
- this.out = out;
- off = 0;
- ord = 0L;
- finished = false;
+ super(out, blockSize);
}
- private void checkNotFinished() {
- if (finished) {
- throw new IllegalStateException("Already finished");
- }
- }
-
- /** Append a new long. */
- public void add(long l) throws IOException {
- checkNotFinished();
- if (off == values.length) {
- flush();
- }
- values[off++] = l;
- ++ord;
- }
-
- /** Flush all buffered data to disk. This instance is not usable anymore
- * after this method has been called until {@link #reset(DataOutput)} has
- * been called. */
- public void finish() throws IOException {
- checkNotFinished();
- if (off > 0) {
- flush();
- }
- finished = true;
- }
-
- private void flush() throws IOException {
+ protected void flush() throws IOException {
assert off > 0;
long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
for (int i = 0; i < off; ++i) {
@@ -149,26 +73,10 @@ public final class BlockPackedWriter {
values[i] -= min;
}
}
- final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired);
- final int iterations = values.length / encoder.valueCount();
- final int blockSize = encoder.blockCount() * 8 * iterations;
- if (blocks == null || blocks.length < blockSize) {
- blocks = new byte[blockSize];
- }
- if (off < values.length) {
- Arrays.fill(values, off, values.length, 0L);
- }
- encoder.encode(values, 0, blocks, 0, iterations);
- final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired);
- out.writeBytes(blocks, blockCount);
+ writeValues(bitsRequired);
}
off = 0;
}
- /** Return the number of values which have been added. */
- public long ord() {
- return ord;
- }
-
}
Added: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java?rev=1436983&view=auto
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java (added)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedReader.java Tue Jan 22 14:54:56 2013
@@ -0,0 +1,83 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.packed.AbstractBlockPackedWriter.checkBlockSize;
+import static org.apache.lucene.util.packed.BlockPackedReaderIterator.zigZagDecode;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+
+/**
+ * Provides random access to a stream written with
+ * {@link MonotonicBlockPackedWriter}.
+ * @lucene.internal
+ */
+public final class MonotonicBlockPackedReader {
+
+ private final int blockShift, blockMask;
+ private final long valueCount;
+ private final long[] minValues;
+ private final float[] averages;
+ private final PackedInts.Reader[] subReaders;
+
+ /** Sole constructor. */
+ public MonotonicBlockPackedReader(IndexInput in, int packedIntsVersion, int blockSize, long valueCount, boolean direct) throws IOException {
+ checkBlockSize(blockSize);
+ this.valueCount = valueCount;
+ blockShift = Integer.numberOfTrailingZeros(blockSize);
+ blockMask = blockSize - 1;
+ final int numBlocks = (int) (valueCount / blockSize) + (valueCount % blockSize == 0 ? 0 : 1);
+ if (numBlocks * blockSize < valueCount) {
+ throw new IllegalArgumentException("valueCount is too large for this block size");
+ }
+ minValues = new long[numBlocks];
+ averages = new float[numBlocks];
+ subReaders = new PackedInts.Reader[numBlocks];
+ for (int i = 0; i < numBlocks; ++i) {
+ minValues[i] = in.readVLong();
+ averages[i] = Float.intBitsToFloat(in.readInt());
+ final int bitsPerValue = in.readVInt();
+ if (bitsPerValue > 64) {
+ throw new IOException("Corrupted");
+ }
+ if (bitsPerValue == 0) {
+ subReaders[i] = new PackedInts.NullReader(blockSize);
+ } else {
+ final int size = (int) Math.min(blockSize, valueCount - (long) i * blockSize);
+ if (direct) {
+ final long pointer = in.getFilePointer();
+ subReaders[i] = PackedInts.getDirectReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue);
+ in.seek(pointer + PackedInts.Format.PACKED.byteCount(packedIntsVersion, size, bitsPerValue));
+ } else {
+ subReaders[i] = PackedInts.getReaderNoHeader(in, PackedInts.Format.PACKED, packedIntsVersion, size, bitsPerValue);
+ }
+ }
+ }
+ }
+
+ /** Get value at <code>index</code>. */
+ public long get(long index) {
+ assert index >= 0 && index < valueCount;
+ final int block = (int) (index >>> blockShift);
+ final int idx = (int) (index & blockMask);
+ return minValues[block] + (long) (idx * averages[block]) + zigZagDecode(subReaders[block].get(idx));
+ }
+
+}
Added: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedWriter.java?rev=1436983&view=auto
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedWriter.java (added)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/MonotonicBlockPackedWriter.java Tue Jan 22 14:54:56 2013
@@ -0,0 +1,76 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A writer for large monotonically increasing sequences of positive longs.
+ * <p>
+ * The sequence is divided into fixed-size blocks and for each block, the
+ * average value per ord is computed, followed by the delta from the expected
+ * value for every ord, using as few bits as possible. Each block has an
+ * overhead between 6 and 14 bytes.
+ * @see MonotonicBlockPackedReader
+ * @lucene.internal
+ */
+public final class MonotonicBlockPackedWriter extends AbstractBlockPackedWriter {
+
+ /**
+ * Sole constructor.
+ * @param blockSize the number of values of a single block, must be a power of 2
+ */
+ public MonotonicBlockPackedWriter(DataOutput out, int blockSize) {
+ super(out, blockSize);
+ }
+
+ @Override
+ public void add(long l) throws IOException {
+ assert l >= 0;
+ super.add(l);
+ }
+
+ protected void flush() throws IOException {
+ assert off > 0;
+
+ // TODO: perform a true linear regression?
+ final long min = values[0];
+ final float avg = off == 1 ? 0f : (float) (values[off - 1] - min) / (off - 1);
+
+ long maxZigZagDelta = 0;
+ for (int i = 0; i < off; ++i) {
+ values[i] = zigZagEncode(values[i] - min - (long) (avg * i));
+ maxZigZagDelta = Math.max(maxZigZagDelta, values[i]);
+ }
+
+ out.writeVLong(min);
+ out.writeInt(Float.floatToIntBits(avg));
+ if (maxZigZagDelta == 0) {
+ out.writeVInt(0);
+ } else {
+ final int bitsRequired = PackedInts.bitsRequired(maxZigZagDelta);
+ out.writeVInt(bitsRequired);
+ writeValues(bitsRequired);
+ }
+
+ off = 0;
+ }
+
+}
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java Tue Jan 22 14:54:56 2013
@@ -24,7 +24,6 @@ import org.apache.lucene.store.DataInput
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.LongsRef;
-import org.apache.lucene.util.packed.PackedInts.Format;
/**
* Simplistic compression for array of unsigned long values.
@@ -656,6 +655,53 @@ public class PackedInts {
}
}
+ /** A {@link Reader} which has all its values equal to 0 (bitsPerValue = 0). */
+ public static final class NullReader implements Reader {
+
+ private final int valueCount;
+
+ /** Sole constructor. */
+ public NullReader(int valueCount) {
+ this.valueCount = valueCount;
+ }
+
+ @Override
+ public long get(int index) {
+ return 0;
+ }
+
+ @Override
+ public int get(int index, long[] arr, int off, int len) {
+ return 0;
+ }
+
+ @Override
+ public int getBitsPerValue() {
+ return 0;
+ }
+
+ @Override
+ public int size() {
+ return valueCount;
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0;
+ }
+
+ @Override
+ public Object getArray() {
+ return null;
+ }
+
+ @Override
+ public boolean hasArray() {
+ return false;
+ }
+
+ }
+
/** A write-once Writer.
* @lucene.internal
*/
Modified: lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java?rev=1436983&r1=1436982&r2=1436983&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java Tue Jan 22 14:54:56 2013
@@ -979,4 +979,45 @@ public class TestPackedInts extends Luce
}
}
+ public void testMonotonicBlockPackedReaderWriter() throws IOException {
+ final int iters = atLeast(2);
+ for (int iter = 0; iter < iters; ++iter) {
+ final int blockSize = 1 << _TestUtil.nextInt(random(), 6, 18);
+ final int valueCount = random().nextInt(1 << 18);
+ final long[] values = new long[valueCount];
+ if (valueCount > 0) {
+ values[0] = random().nextBoolean() ? random().nextInt(10) : random().nextInt(Integer.MAX_VALUE);
+ int maxDelta = random().nextInt(64);
+ for (int i = 1; i < valueCount; ++i) {
+ if (random().nextDouble() < 0.1d) {
+ maxDelta = random().nextInt(64);
+ }
+ values[i] = Math.max(0, values[i-1] + _TestUtil.nextInt(random(), -16, maxDelta));
+ }
+ }
+
+ final Directory dir = newDirectory();
+ final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
+ final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(out, blockSize);
+ for (int i = 0; i < valueCount; ++i) {
+ assertEquals(i, writer.ord());
+ writer.add(values[i]);
+ }
+ assertEquals(valueCount, writer.ord());
+ writer.finish();
+ assertEquals(valueCount, writer.ord());
+ final long fp = out.getFilePointer();
+ out.close();
+
+ final IndexInput in = dir.openInput("out.bin", IOContext.DEFAULT);
+ final MonotonicBlockPackedReader reader = new MonotonicBlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount, random().nextBoolean());
+ assertEquals(fp, in.getFilePointer());
+ for (int i = 0; i < valueCount; ++i) {
+ assertEquals("i=" +i, values[i], reader.get(i));
+ }
+ in.close();
+ dir.close();
+ }
+ }
+
}