You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/01/08 12:16:14 UTC

svn commit: r1430210 - in /lucene/dev/trunk/lucene/core/src: java/org/apache/lucene/util/packed/BlockPackedReader.java java/org/apache/lucene/util/packed/BlockPackedWriter.java test/org/apache/lucene/util/packed/TestPackedInts.java

Author: jpountz
Date: Tue Jan  8 11:16:14 2013
New Revision: 1430210

URL: http://svn.apache.org/viewvc?rev=1430210&view=rev
Log:
LUCENE-4643: New API to read/write fixed-size blocks of packed ints.

Added:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java   (with props)
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java   (with props)
Modified:
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java?rev=1430210&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedReader.java Tue Jan  8 11:16:14 2013
@@ -0,0 +1,227 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.util.packed.BlockPackedWriter.BPV_SHIFT;
+import static org.apache.lucene.util.packed.BlockPackedWriter.MIN_VALUE_EQUALS_0;
+import static org.apache.lucene.util.packed.BlockPackedWriter.checkBlockSize;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.LongsRef;
+
+/**
+ * Reader for sequences of longs written with {@link BlockPackedWriter}.
+ * @see BlockPackedWriter
+ * @lucene.internal
+ */
+public final class BlockPackedReader {
+
+  static long zigZagDecode(long n) {
+    return ((n >>> 1) ^ -(n & 1));
+  }
+
+  // same as DataInput.readVLong but supports negative values
+  static long readVLong(DataInput in) throws IOException {
+    byte b = in.readByte();
+    if (b >= 0) return b;
+    long i = b & 0x7FL;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 7;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 14;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 21;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 28;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 35;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 42;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0x7FL) << 49;
+    if (b >= 0) return i;
+    b = in.readByte();
+    i |= (b & 0xFFL) << 56;
+    return i;
+  }
+
+  final DataInput in;
+  final int packedIntsVersion;
+  final long valueCount;
+  final int blockSize;
+  final LongsRef values;
+  byte[] blocks;
+  int off;
+  long ord;
+
+  /** Sole constructor.
+   * @param blockSize the number of values of a block, must be equal to the
+   *                  block size of the {@link BlockPackedWriter} which has
+   *                  been used to write the stream
+   */
+  public BlockPackedReader(DataInput in, int packedIntsVersion, int blockSize, long valueCount) {
+    checkBlockSize(blockSize);
+    this.in = in;
+    this.packedIntsVersion = packedIntsVersion;
+    this.blockSize = blockSize;
+    this.values = new LongsRef(blockSize);
+    assert valueCount >= 0;
+    this.valueCount = valueCount;
+    off = blockSize;
+    ord = 0;
+  }
+
+  /** Skip exactly <code>count</code> values. */
+  public void skip(long count) throws IOException {
+    assert count >= 0;
+    if (ord + count > valueCount || ord + count < 0) {
+      throw new EOFException();
+    }
+
+    // 1. skip buffered values
+    final int skipBuffer = (int) Math.min(count, blockSize - off);
+    off += skipBuffer;
+    ord += skipBuffer;
+    count -= skipBuffer;
+    if (count == 0L) {
+      return;
+    }
+
+    // 2. skip as many blocks as necessary
+    assert off == blockSize;
+    while (count >= blockSize) {
+      final int token = in.readByte() & 0xFF;
+      final int bitsPerValue = token >>> BPV_SHIFT;
+      if (bitsPerValue > 64) {
+        throw new IOException("Corrupted");
+      }
+      if ((token & MIN_VALUE_EQUALS_0) == 0) {
+        readVLong(in);
+      }
+      final long blockBytes = PackedInts.Format.PACKED.byteCount(packedIntsVersion, blockSize, bitsPerValue);
+      skipBytes(blockBytes);
+      ord += blockSize;
+      count -= blockSize;
+    }
+    if (count == 0L) {
+      return;
+    }
+
+    // 3. skip last values
+    assert count < blockSize;
+    refill();
+    ord += count;
+    off += count;
+  }
+
+  private void skipBytes(long count) throws IOException {
+    if (in instanceof IndexInput) {
+      final IndexInput iin = (IndexInput) in;
+      iin.seek(iin.getFilePointer() + count);
+    } else {
+      if (blocks == null) {
+        blocks = new byte[blockSize];
+      }
+      long skipped = 0;
+      while (skipped < count) {
+        final int toSkip = (int) Math.min(blocks.length, count - skipped);
+        in.readBytes(blocks, 0, toSkip);
+        skipped += toSkip;
+      }
+    }
+  }
+
+  /** Read the next value. */
+  public long next() throws IOException {
+    next(1);
+    assert values.length == 1;
+    return values.longs[values.offset];
+  }
+
+  /** Read between <tt>1</tt> and <code>count</code> values. */
+  public LongsRef next(int count) throws IOException {
+    assert count > 0;
+    if (ord == valueCount) {
+      throw new EOFException();
+    }
+    if (off == blockSize) {
+      refill();
+    }
+
+    count = Math.min(count, blockSize - off);
+    count = (int) Math.min(count, valueCount - ord);
+
+    values.offset = off;
+    values.length = count;
+    off += count;
+    ord += count;
+    return values;
+  }
+
+  private void refill() throws IOException {
+    final int token = in.readByte() & 0xFF;
+    final boolean minEquals0 = (token & MIN_VALUE_EQUALS_0) != 0;
+    final int bitsPerValue = token >>> BPV_SHIFT;
+    if (bitsPerValue > 64) {
+      throw new IOException("Corrupted");
+    }
+    final long minValue = minEquals0 ? 0L : zigZagDecode(1L + readVLong(in));
+    assert minEquals0 || minValue != 0;
+
+    if (bitsPerValue == 0) {
+      Arrays.fill(values.longs, minValue);
+    } else {
+      final PackedInts.Decoder decoder = PackedInts.getDecoder(PackedInts.Format.PACKED, packedIntsVersion, bitsPerValue);
+      final int iterations = blockSize / decoder.valueCount();
+      final int blocksSize = iterations * 8 * decoder.blockCount();
+      if (blocks == null || blocks.length < blocksSize) {
+        blocks = new byte[blocksSize];
+      }
+
+      final int valueCount = (int) Math.min(this.valueCount - ord, blockSize);
+      final int blocksCount = (int) PackedInts.Format.PACKED.byteCount(packedIntsVersion, valueCount, bitsPerValue);
+      in.readBytes(blocks, 0, blocksCount);
+
+      decoder.decode(blocks, 0, values.longs, 0, iterations);
+
+      if (minValue != 0) {
+        for (int i = 0; i < valueCount; ++i) {
+          values.longs[i] += minValue;
+        }
+      }
+    }
+    off = 0;
+  }
+
+  /** Return the offset of the next value to read. */
+  public long ord() {
+    return ord;
+  }
+
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java?rev=1430210&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/packed/BlockPackedWriter.java Tue Jan  8 11:16:14 2013
@@ -0,0 +1,164 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A writer for large sequences of longs.
+ * <p>
+ * The sequence is divided into fixed-size blocks and for each block, the
+ * difference between each value and the minimum value of the block is encoded
+ * using as few bits as possible. Memory usage of this class is proportional to
+ * the block size. Each block has an overhead between 1 and 10 bytes to store
+ * the minimum value and the number of bits per value of the block.
+ * @see BlockPackedReader
+ * @lucene.internal
+ */
+public final class BlockPackedWriter {
+
+  static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
+  static final int MIN_VALUE_EQUALS_0 = 1 << 0;
+  static final int BPV_SHIFT = 1;
+
+  static void checkBlockSize(int blockSize) {
+    if (blockSize <= 0 || blockSize > MAX_BLOCK_SIZE) {
+      throw new IllegalArgumentException("blockSize must be > 0 and < " + MAX_BLOCK_SIZE + ", got " + blockSize);
+    }
+    if (blockSize % 64 != 0) {
+      throw new IllegalArgumentException("blockSize must be a multiple of 64, got " + blockSize);
+    }
+  }
+
+  static long zigZagEncode(long n) {
+    return (n >> 63) ^ (n << 1);
+  }
+
+  // same as DataOutput.writeVLong but accepts negative values
+  static void writeVLong(DataOutput out, long i) throws IOException {
+    int k = 0;
+    while ((i & ~0x7FL) != 0L && k++ < 8) {
+      out.writeByte((byte)((i & 0x7FL) | 0x80L));
+      i >>>= 7;
+    }
+    out.writeByte((byte) i);
+  }
+
+  final DataOutput out;
+  final long[] values;
+  byte[] blocks;
+  int off;
+  long ord;
+  boolean finished;
+
+  /**
+   * Sole constructor.
+   * @param blockSize the number of values of a single block, must be a multiple of <tt>64</tt>
+   */
+  public BlockPackedWriter(DataOutput out, int blockSize) {
+    checkBlockSize(blockSize);
+    this.out = out;
+    values = new long[blockSize];
+    off = 0;
+    ord = 0L;
+    finished = false;
+  }
+
+  private void checkNotFinished() {
+    if (finished) {
+      throw new IllegalStateException("Already finished");
+    }
+  }
+
+  /** Append a new long. */
+  public void add(long l) throws IOException {
+    checkNotFinished();
+    if (off == values.length) {
+      flush();
+    }
+    values[off++] = l;
+    ++ord;
+  }
+
+  /** Flush all buffered data to disk. This instance is not usable anymore
+   *  after this method has been called. */
+  public void finish() throws IOException {
+    checkNotFinished();
+    if (off > 0) {
+      flush();
+    }
+    finished = true;
+  }
+
+  private void flush() throws IOException {
+    assert off > 0;
+    long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
+    for (int i = 0; i < off; ++i) {
+      min = Math.min(values[i], min);
+      max = Math.max(values[i], max);
+    }
+
+    final long delta = max - min;
+    final int bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInts.bitsRequired(delta);
+    if (bitsRequired == 64) {
+      // no need to delta-encode
+      min = 0L;
+    } else if (min > 0L) {
+      // make min as small as possible so that writeVLong requires fewer bytes
+      min = Math.max(0L, max - PackedInts.maxValue(bitsRequired));
+    }
+
+    final int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? MIN_VALUE_EQUALS_0 : 0);
+    out.writeByte((byte) token);
+
+    if (min != 0) {
+      writeVLong(out, zigZagEncode(min) - 1);
+    }
+
+    if (bitsRequired > 0) {
+      if (min != 0) {
+        for (int i = 0; i < off; ++i) {
+          values[i] -= min;
+        }
+      }
+      final PackedInts.Encoder encoder = PackedInts.getEncoder(PackedInts.Format.PACKED, PackedInts.VERSION_CURRENT, bitsRequired);
+      final int iterations = values.length / encoder.valueCount();
+      final int blockSize = encoder.blockCount() * 8 * iterations;
+      if (blocks == null || blocks.length < blockSize) {
+        blocks = new byte[blockSize];
+      }
+      if (off < values.length) {
+        Arrays.fill(values, off, values.length, 0L);
+      }
+      encoder.encode(values, 0, blocks, 0, iterations);
+      final int blockCount = (int) PackedInts.Format.PACKED.byteCount(PackedInts.VERSION_CURRENT, off, bitsRequired);
+      out.writeBytes(blocks, blockCount);
+    }
+
+    off = 0;
+  }
+
+  /** Return the number of values which have been added. */
+  public long ord() {
+    return ord;
+  }
+
+}

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java?rev=1430210&r1=1430209&r2=1430210&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java Tue Jan  8 11:16:14 2013
@@ -27,6 +27,8 @@ import java.util.Locale;
 import java.util.Random;
 
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
@@ -830,4 +832,102 @@ public class TestPackedInts extends Luce
     dir.close();
   }
 
+  public void testBlockPackedReaderWriter() throws IOException {
+    final int iters = atLeast(2);
+    for (int iter = 0; iter < iters; ++iter) {
+      final int blockSize = 64 * _TestUtil.nextInt(random(), 1, 1 << 12);
+      final int valueCount = random().nextInt(1 << 18);
+      final long[] values = new long[valueCount];
+      long minValue = 0;
+      int bpv = 0;
+      for (int i = 0; i < valueCount; ++i) {
+        if (i % blockSize == 0) {
+          minValue = rarely() ? random().nextInt(256) : rarely() ? -5 : random().nextLong();
+          bpv = random().nextInt(65);
+        }
+        if (bpv == 0) {
+          values[i] = minValue;
+        } else if (bpv == 64) {
+          values[i] = random().nextLong();
+        } else {
+          values[i] = minValue + _TestUtil.nextLong(random(), 0, (1L << bpv) - 1);
+        }
+      }
+  
+      final Directory dir = newDirectory();
+      final IndexOutput out = dir.createOutput("out.bin", IOContext.DEFAULT);
+      final BlockPackedWriter writer = new BlockPackedWriter(out, blockSize);
+      for (int i = 0; i < valueCount; ++i) {
+        assertEquals(i, writer.ord());
+        writer.add(values[i]);
+      }
+      assertEquals(valueCount, writer.ord());
+      writer.finish();
+      assertEquals(valueCount, writer.ord());
+      final long fp = out.getFilePointer();
+      out.close();
+
+      DataInput in = dir.openInput("out.bin", IOContext.DEFAULT);
+      if (random().nextBoolean()) {
+        byte[] buf = new byte[(int) fp];
+        in.readBytes(buf, 0, (int) fp);
+        ((IndexInput) in).close();
+        in = new ByteArrayDataInput(buf);
+      }
+      final BlockPackedReader reader = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+      for (int i = 0; i < valueCount; ) {
+        if (random().nextBoolean()) {
+          assertEquals("" + i, values[i], reader.next());
+          ++i;
+        } else {
+          final LongsRef nextValues = reader.next(_TestUtil.nextInt(random(), 1, 1024));
+          for (int j = 0; j < nextValues.length; ++j) {
+            assertEquals("" + (i + j), values[i + j], nextValues.longs[nextValues.offset + j]);
+          }
+          i += nextValues.length;
+        }
+        assertEquals(i, reader.ord());
+      }
+      assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+      try {
+        reader.next();
+        assertTrue(false);
+      } catch (IOException e) {
+        // OK
+      }
+
+      if (in instanceof ByteArrayDataInput) {
+        ((ByteArrayDataInput) in).setPosition(0);
+      } else {
+        ((IndexInput) in).seek(0L);
+      }
+      final BlockPackedReader reader2 = new BlockPackedReader(in, PackedInts.VERSION_CURRENT, blockSize, valueCount);
+      int i = 0;
+      while (true) {
+        final int skip = _TestUtil.nextInt(random(), 0, valueCount - i);
+        reader2.skip(skip);
+        i += skip;
+        assertEquals(i, reader2.ord());
+        if (i == valueCount) {
+          break;
+        } else {
+          assertEquals(values[i], reader2.next());
+          ++i;
+        }
+      }
+      assertEquals(fp, in instanceof ByteArrayDataInput ? ((ByteArrayDataInput) in).getPosition() : ((IndexInput) in).getFilePointer());
+      try {
+        reader2.skip(1);
+        assertTrue(false);
+      } catch (IOException e) {
+        // OK
+      }
+
+      if (in instanceof IndexInput) {
+        ((IndexInput) in).close();
+      }
+      dir.close();
+    }
+  }
+
 }