You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/01/08 12:16:14 UTC
svn commit: r1430210 - in /lucene/dev/trunk/lucene/core/src:
java/org/apache/lucene/util/packed/BlockPackedReader.java
java/org/apache/lucene/util/packed/BlockPackedWriter.java
test/org/apache/lucene/util/packed/TestPackedInts.java
Author: jpountz
Date: Tue Jan 8 11:16:14 2013
New Revision: 1430210
URL: http://svn.apache.org/viewvc?rev=1430210&view=rev
Log:
LUCENE-4643: New API to read/write fixed-size blocks of packed ints.
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java (with props)
Modified:
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java?rev=1430210&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java Tue Jan 8 11:16:14 2013
@@ -0,0 +1,227 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT;
+import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0;
+import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.LongsRef;
+
+/**
+ * Reader for sequences of longs written with {@link BlockPackedWriter}.
+ * @see BlockPackedWriter
+ * @lucene.internal
+ */
+public final class BlockPackedReader {
+
+ static long zigZagDecode(long n) {
+ return ((n >>> 1) ^ -(n & 1));
+ }
+
+ // same as DataInput.readVLong but supports negative values
+ static long readVLong(DataInput in) throws IOException {
+ byte b = in.readByte();
+ if (b >= 0) return b;
+ long i = b & 0x7FL;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 7;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 14;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 21;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 28;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 35;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 42;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0x7FL) << 49;
+ if (b >= 0) return i;
+ b = in.readByte();
+ i |= (b & 0xFFL) << 56;
+ return i;
+ }
+
+ final DataInput in;
+ final int packedIntsVersion;
+ final long valueCount;
+ final int blockSize;
+ final LongsRef values;
+ byte[] blocks;
+ int off;
+ long ord;
+
+ /** Sole constructor.
+ * @param blockSize the number of values of a block, must be equal to the
+ * block size of the {@link BlockPackedWriter} which has
+ * been used to write the stream
+ */
+ public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) {
+ checkBlockSize(blockSize);
+ this.in = in;
+ this.packedIntsVersion = packedIntsVersion;
+ this.blockSize = blockSize;
+ this.values = new LongsRef(blockSize);
+ assert valueCount >= 0;
+ this.valueCount = valueCount;
+ off = blockSize;
+ ord = 0;
+ }
+
+ /** Skip exactly <code>count</code> values. */
+ public void skip(long count) throws IOException {
+ assert count >= 0;
+ if (ord + count > valueCount || ord + count < 0) {
+ throw new EOFException();
+ }
+
+ // 1. skip buffered values
+ final int skipBuffer = (int) Math.min(count, blockSize - off);
+ off += skipBuffer;
+ ord += skipBuffer;
+ count -= skipBuffer;
+ if (count == 0L) {
+ return;
+ }
+
+ // 2. skip as many blocks as necessary
+ assert off == blockSize;
+ while (count >= blockSize) {
+ final int token = in.readByte() & 0xFF;
+ final int bitsPerValue = token >>> BPV_SHIFT;
+ if (bitsPerValue > 64) {
+ throw new IOException("Corrupted");
+ }
+ if ((token & MIN_VALUE_EQUALS_0) == 0) {
+ readVLong(in);
+ }
+ final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue);
+ skipBytes(blockBytes);
+ ord += blockSize;
+ count -= blockSize;
+ }
+ if (count == 0L) {
+ return;
+ }
+
+ // 3. skip last values
+ assert count < blockSize;
+ refill();
+ ord += count;
+ off += count;
+ }
+
+ private void skipBytes(long count) throws IOException {
+ if (in instanceof IndexInput) {
+ final IndexInput iin = (IndexInput) in;
+ iin.seek(iin.getFilePointer() + count);
+ } else {
+ if (blocks == null) {
+ blocks = new byte[blockSize];
+ }
+ long skipped = 0;
+ while (skipped < count) {
+ final int toSkip = (int) Math.min(blocks.length, count - skipped);
+ in.readBytes(blocks, 0, toSkip);
+ skipped += toSkip;
+ }
+ }
+ }
+
+ /** Read the next value. */
+ public long next() throws IOException {
+ next(1);
+ assert values.length == 1;
+ return values.longs[values.offset];
+ }
+
+ /** Read between <tt>1</tt> and <code>count</code> values. */
+ public LongsRef next(int count) throws IOException {
+ assert count > 0;
+ if (ord == valueCount) {
+ throw new EOFException();
+ }
+ if (off == blockSize) {
+ refill();
+ }
+
+ count = Math.min(count, blockSize - off);
+ count = (int) Math.min(count, valueCount - ord);
+
+ values.offset = off;
+ values.length = count;
+ off += count;
+ ord += count;
+ return values;
+ }
+
+ private void refill() throws IOException {
+ final int token = in.readByte() & 0xFF;
+ final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0;
+ final int bitsPerValue = token >>> BPV_SHIFT;
+ if (bitsPerValue > 64) {
+ throw new IOException("Corrupted");
+ }
+ final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in));
+ assert minEquals0 || minValue != 0;
+
+ if (bitsPerValue == 0) {
+ Arrays.fill(values.longs, minValue);
+ } else {
+ final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue);
+ final int iterations = blockSize / decoder.valueCount();
+ final int blocksSize = iterations * 8 * decoder.blockCount();
+ if (blocks == null || blocks.length < blocksSize) {
+ blocks = new byte[blocksSize];
+ }
+
+ final int valueCount = (int) Math.min(this.valueCount - ord, blockSize);
+ final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue);
+ in.readBytes(blocks, 0, blocksCount);
+
+ decoder.decode(blocks, 0, values.longs, 0, iterations);
+
+ if (minValue != 0) {
+ for (int i = 0; i < valueCount; ++i) {
+ values.longs[i] += minValue;
+ }
+ }
+ }
+ off = 0;
+ }
+
+ /** Return the offset of the next value to read. */
+ public long ord() {
+ return ord;
+ }
+
+}
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java?rev=1430210&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java Tue Jan 8 11:16:14 2013
@@ -0,0 +1,164 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A writer for large sequences of longs.
+ * <p>
+ * The sequence is divided into fixed-size blocks and for each block, the
+ * difference between each value and the minimum value of the block is encoded
+ * using as few bits as possible. Memory usage of this class is proportional to
+ * the block size. Each block has an overhead between 1 and 10 bytes to store
+ * the minimum value and the number of bits per value of the block.
+ * @see BlockPackedReader
+ * @lucene.internal
+ */
+public final class BlockPackedWriter {
+
+ static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
+ static final int MIN_VALUE_EQUALS_0 = 1 << 0;
+ static final int BPV_SHIFT = 1;
+
+ static void checkBlockSize(int blockSize) {
+ if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
+ throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
+ }
+ if (blockSize % 64 != 0) {
+ throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize);
+ }
+ }
+
+ static long zigZagEncode(long n) {
+ return (n >> 63) ^ (n << 1);
+ }
+
+ // same as DataOutput.writeVLong but accepts negative values
+ static void writeVLong(DataOutput out, long i) throws IOException {
+ int k = 0;
+ while ((i & ~0x7FL) != 0L && k++ < 8) {
+ out.writeByte((byte)((i & 0x7FL) | 0x80L));
+ i >>>= 7;
+ }
+ out.writeByte((byte) i);
+ }
+
+ final DataOutput out;
+ final long[] values;
+ byte[] blocks;
+ int off;
+ long ord;
+ boolean finished;
+
+ /**
+ * Sole constructor.
+ * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+ */
+ public BlockPackedWriter(DataOutput out, int blockSize) {
+ checkBlockSize(blockSize);
+ this.out = out;
+ values = new long[blockSize];
+ off = 0;
+ ord = 0L;
+ finished = false;
+ }
+
+ private void checkNotFinished() {
+ if (finished) {
+ throw new IllegalStateException("Already finished");
+ }
+ }
+
+ /** Append a new long. */
+ public void add(long l) throws IOException {
+ checkNotFinished();
+ if (off == values.length) {
+ flush();
+ }
+ values[off++] = l;
+ ++ord;
+ }
+
+ /** Flush all buffered data to disk. This instance is not usable anymore
+ * after this method has been called. */
+ public void finish() throws IOException {
+ checkNotFinished();
+ if (off > 0) {
+ flush();
+ }
+ finished = true;
+ }
+
+ private void flush() throws IOException {
+ assert off > 0;
+ long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+ for (int i = 0; i < off; ++i) {
+ min = Math.min(values[i], min);
+ max = Math.max(values[i], max);
+ }
+
+ final long delta = max - min;
+ final int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta);
+ if (bitsRequired == 64) {
+ // no need to delta-encode
+ min = 0L;
+ } else if (min > 0L) {
+ // make min as small as possible so that writeVLong requires fewer bytes
+ min = Math.max(0L, max - PackedInts.maxValue(bitsRequired));
+ }
+
+ final int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? MIN_VALUE_EQUALS_0 : 0);
+ out.writeByte((byte) token);
+
+ if (min != 0) {
+ writeVLong(out, zigZagEncode(min) - 1);
+ }
+
+ if (bitsRequired > 0) {
+ if (min != 0) {
+ for (int i = 0; i < off; ++i) {
+ values[i] -= min;
+ }
+ }
+ final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired);
+ final int iterations = values.length / encoder.valueCount();
+ final int blockSize = encoder.blockCount() * 8 * iterations;
+ if (blocks == null || blocks.length < blockSize) {
+ blocks = new byte[blockSize];
+ }
+ if (off < values.length) {
+ Arrays.fill(values, off, values.length, 0L);
+ }
+ encoder.encode(values, 0, blocks, 0, iterations);
+ final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired);
+ out.writeBytes(blocks, blockCount);
+ }
+
+ off = 0;
+ }
+
+ /** Return the number of values which have been added. */
+ public long ord() {
+ return ord;
+ }
+
+}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java?rev=1430210&r1=1430209&r2=1430210&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java Tue Jan 8 11:16:14 2013
@@ -27,6 +27,8 @@ import java.util.Locale;
import java.util.Random;
import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
@@ -830,4 +832,102 @@ public class TestPackedInts extends Luce
dir.close();
}
+ public void testBlockPackedReaderWriter() throws IOException {
+ final int iters = atLeast(2);
+ for (int iter = 0; iter < iters; ++iter) {
+ final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12);
+ final int valueCount = random().nextInt(1 << 18);
+ final long[] values = new long[valueCount];
+ long minValue = 0;
+ int bpv = 0;
+ for (int i = 0; i < valueCount; ++i) {
+ if (i % blockSize == 0) {
+ minValue = rarely() ? random().nextInt(256) : rarely() ? -5 : random().nextLong();
+ bpv = random().nextInt(65);
+ }
+ if (bpv == 0) {
+ values[i] = minValue;
+ } else if (bpv == 64) {
+ values[i] = random().nextLong();
+ } else {
+ values[i] = minValue + _TestUtil.nextLong(random(), 0, (1L << bpv) - 1);
+ }
+ }
+
+ final Directory dir = newDirectory();
+ final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
+ final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize);
+ for (int i = 0; i < valueCount; ++i) {
+ assertEquals(i, writer.ord());
+ writer.add(values[i]);
+ }
+ assertEquals(valueCount, writer.ord());
+ writer.finish();
+ assertEquals(valueCount, writer.ord());
+ final long fp = out.getFilePointer();
+ out.close();
+
+ DataInput in = dir.openInput("out.bin", IOContext.DEFAULT);
+ if (random().nextBoolean()) {
+ byte[] buf = new byte[(int) fp];
+ in.readBytes(buf, 0, (int) fp);
+ ((IndexInput) in).close();
+ in = new ByteArrayDataInput(buf);
+ }
+ final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+ for (int i = 0; i < valueCount; ) {
+ if (random().nextBoolean()) {
+ assertEquals("" + i, values[i], reader.next());
+ ++i;
+ } else {
+ final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024));
+ for (int j = 0; j < nextValues.length; ++j) {
+ assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]);
+ }
+ i += nextValues.length;
+ }
+ assertEquals(i, reader.ord());
+ }
+ assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+ try {
+ reader.next();
+ assertTrue(false);
+ } catch (IOException e) {
+ // OK
+ }
+
+ if (in instanceof ByteArrayDataInput) {
+ ((ByteArrayDataInput) in).setPosition(0);
+ } else {
+ ((IndexInput) in).seek(0L);
+ }
+ final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+ int i = 0;
+ while (true) {
+ final int skip = _TestUtil.nextInt(random(), 0, valueCount - i);
+ reader2.skip(skip);
+ i += skip;
+ assertEquals(i, reader2.ord());
+ if (i == valueCount) {
+ break;
+ } else {
+ assertEquals(values[i], reader2.next());
+ ++i;
+ }
+ }
+ assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+ try {
+ reader2.skip(1);
+ assertTrue(false);
+ } catch (IOException e) {
+ // OK
+ }
+
+ if (in instanceof IndexInput) {
+ ((IndexInput) in).close();
+ }
+ dir.close();
+ }
+ }
+
}