You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/11 16:53:10 UTC
svn commit: r1069851 - in
/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs:
intblock/VariableIntFixedPhyBlockIndexInput.java
intblock/VariableIntFixedPhyBlockIndexOutput.java simple64/Simple64Codec.java
Author: rmuir
Date: Fri Feb 11 15:53:10 2011
New Revision: 1069851
URL: http://svn.apache.org/viewvc?rev=1069851&view=rev
Log:
LUCENE-2905: write pointers and skip data more efficiently for varint codecs with a fixed physical blocksize
Added:
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java (with props)
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java (with props)
Modified:
lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java?rev=1069851&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java Fri Feb 11 15:53:10 2011
@@ -0,0 +1,203 @@
+package org.apache.lucene.index.codecs.intblock;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.BulkPostingsEnum;
+import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+
+// TODO: much of this can be shared code w/ the variable case
+// TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least)
+
+/** Abstract base class that reads variable-size blocks of ints
+ * from an IndexInput that have a fixed physical size in bytes.
+ * While this is a simple approach, a
+ * more performant approach would directly create an impl
+ * of IntIndexInput inside Directory. Wrapping a generic
+ * IndexInput will likely cost performance.
+ *
+ * @lucene.experimental
+ */
+public abstract class VariableIntFixedPhyBlockIndexInput extends IntIndexInput {
+
+  /** Underlying input; each reader works on its own clone of it. */
+  protected final IndexInput in;
+  /** Length of the int buffer allocated for every reader, read from the file header. */
+  protected final int maxBlockSize;
+  /** Physical size in bytes of one block, read from the file header. */
+  protected final int phyBlockSize;
+  /** Number of header bytes that precede the first block. */
+  protected final static int HEADER = 8; /* 2 ints */
+
+  /**
+   * Reads the two header ints (maxBlockSize, then phyBlockSize) from {@code in}.
+   *
+   * @param in input to read the header and, later, the blocks from
+   * @throws IOException if reading fails or the header values are out of range
+   */
+  protected VariableIntFixedPhyBlockIndexInput(final IndexInput in) throws IOException {
+    this.in = in;
+    maxBlockSize = in.readInt();
+    phyBlockSize = in.readInt();
+    // Fail fast on a bad header instead of hitting NegativeArraySizeException in
+    // reader() or computing nonsensical block addresses in Index.read().
+    if (maxBlockSize < 0 || phyBlockSize <= 0) {
+      throw new IOException("invalid header: maxBlockSize=" + maxBlockSize + " phyBlockSize=" + phyBlockSize);
+    }
+  }
+
+  /**
+   * Creates a reader backed by a fresh clone of the input and a newly
+   * allocated buffer of {@code maxBlockSize} ints.
+   */
+  @Override
+  public Reader reader() throws IOException {
+    final int[] buffer = new int[maxBlockSize];
+    final IndexInput clone = (IndexInput) in.clone();
+    // TODO: can this be simplified?
+    return new Reader(clone, buffer, this.getBlockReader(clone, buffer), phyBlockSize);
+  }
+
+  /** Closes the underlying input. */
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  /** Returns a new, unpositioned index (fp and upto both zero). */
+  @Override
+  public Index index() {
+    return new Index();
+  }
+
+  /**
+   * Supplies the block decoder for one reader. It is handed the reader's
+   * cloned input and the reader's int buffer.
+   */
+  protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException;
+
+  /** Decodes one block per call. */
+  public interface BlockReader {
+    /** Reads the next block; the returned int becomes the reader's {@code end()} value. */
+    public int readBlock() throws IOException;
+    // nocommit -- do we really need?
+    //public void seek(long pos) throws IOException;
+  }
+
+  /** Block reader that tracks which physical block is currently buffered. */
+  public static class Reader extends BulkPostingsEnum.BlockReader {
+    /** Marker for "no block has been read yet"; real block positions are never negative. */
+    private static final long NO_BLOCK = -1;
+
+    private final IndexInput in;
+
+    public final int[] pending;
+
+    private int offset;
+    // File position of the block currently held in the buffer, or NO_BLOCK.
+    private long lastBlockFP = NO_BLOCK;
+    //private int blockSize; // nocommit redundant w/ limit?
+    private final BlockReader blockReader;
+    private int limit;
+    private final int phyBlockSize;
+
+    public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader, final int phyBlockSize)
+        throws IOException {
+      this.in = in;
+      this.pending = pending;
+      this.blockReader = blockReader;
+      this.phyBlockSize = phyBlockSize;
+    }
+
+    /**
+     * Positions this reader at value {@code upto} of the block starting at
+     * file position {@code fp}. The block is only re-read when it is not the
+     * one already buffered.
+     */
+    void seek(final long fp, final int upto) throws IOException {
+      // TODO: should we do this in real-time, not lazy?
+      offset = upto;
+      assert offset >= 0: "pendingUpto=" + offset;
+      if (fp != lastBlockFP) {
+        // Seek to new block
+        in.seek(fp);
+        // nocommit -- why?
+        //blockReader.seek(fp);
+        lastBlockFP = fp;
+        limit = blockReader.readBlock();
+      }
+      // else: seek w/in current block, nothing to load
+
+      // TODO: if we were more clever when writing the
+      // index, such that a seek point wouldn't be written
+      // until the int encoder "committed", we could avoid
+      // this (likely minor) inefficiency:
+
+      // This is necessary for int encoders that are
+      // non-causal, ie must see future int values to
+      // encode the current ones.
+      while (offset >= limit) {
+        offset -= limit;
+        fill();
+      }
+    }
+
+    @Override
+    public int[] getBuffer() {
+      return pending;
+    }
+
+    @Override
+    public int end() {
+      return limit;
+    }
+
+    @Override
+    public int offset() {
+      return offset;
+    }
+
+    /**
+     * Reads the next block and records its file position so that a later
+     * {@code seek} to the same block can reuse the buffer.
+     */
+    @Override
+    public int fill() throws IOException {
+      if (lastBlockFP == NO_BLOCK) {
+        // First read without a preceding seek: the block starts wherever the
+        // input currently is. Adding phyBlockSize to an initial 0 would only
+        // be right when phyBlockSize happens to equal the header size.
+        lastBlockFP = in.getFilePointer();
+      } else {
+        // Blocks follow each other, each phyBlockSize bytes long.
+        lastBlockFP += phyBlockSize;
+      }
+      return limit = blockReader.readBlock();
+    }
+  }
+
+  /** Seek point made of a block file position plus an offset into that block. */
+  private class Index extends IntIndexInput.Index {
+    private long fp;
+    private int upto;
+
+    /**
+     * Decodes one seek point. This is used when reading skip data.
+     *
+     * <p>Absolute form: a vLong block number followed by one unsigned byte
+     * for {@code upto}. Relative form: a vLong whose low bit set means "same
+     * block, remaining bits are the upto delta"; low bit clear means "new
+     * block, remaining bits are the block-count delta", followed by one
+     * unsigned byte for {@code upto}.
+     */
+    @Override
+    public void read(final DataInput indexIn, final boolean absolute) throws IOException {
+      if (absolute) {
+        fp = HEADER + (phyBlockSize * indexIn.readVLong());
+        upto = indexIn.readByte()&0xFF;
+      } else {
+        final long delta = indexIn.readVLong();
+        if ((delta & 1) == 1) {
+          // same block
+          upto += (delta >>> 1);
+        } else {
+          // new block
+          fp += (phyBlockSize * (delta >>> 1));
+          upto = indexIn.readByte()&0xFF;
+        }
+      }
+      // TODO: we can't do this assert because non-causal
+      // int encoders can have upto over the buffer size
+      //assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize;
+    }
+
+    @Override
+    public String toString() {
+      return "VarIntFixedPhyBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize;
+    }
+
+    /** Moves {@code other}, which must be a {@link Reader}, to this seek point. */
+    @Override
+    public void seek(final BulkPostingsEnum.BlockReader other) throws IOException {
+      ((Reader) other).seek(fp, upto);
+    }
+
+    /** Copies the seek point of {@code other}, which must be an Index of this class. */
+    @Override
+    public void set(final IntIndexInput.Index other) {
+      final Index idx = (Index) other;
+      fp = idx.fp;
+      upto = idx.upto;
+    }
+
+    @Override
+    public Object clone() {
+      Index other = new Index();
+      other.fp = fp;
+      other.upto = upto;
+      return other;
+    }
+  }
+}
Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java?rev=1069851&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java Fri Feb 11 15:53:10 2011
@@ -0,0 +1,147 @@
+package org.apache.lucene.index.codecs.intblock;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.codecs.sep.IntIndexOutput;
+import org.apache.lucene.store.IndexOutput;
+
+//TODO: much of this can be shared code w/ the variable case
+//TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least)
+
+/** Abstract base class that writes variable-size blocks of ints
+ * to an IndexOutput that have a fixed physical size in bytes.
+ * While this is a simple approach, a
+ * more performant approach would directly create an impl
+ * of IntIndexOutput inside Directory. Wrapping a generic
+ * IndexOutput will likely cost performance.
+ *
+ * @lucene.experimental
+ */
+public abstract class VariableIntFixedPhyBlockIndexOutput extends IntIndexOutput {
+
+  /** Output that receives the header and the encoded blocks. */
+  protected final IndexOutput out;
+
+  // Number of values handed to add() that it has not yet reported as written.
+  private int upto;
+
+  // TODO: use vint so we can use unused simple selectors for larger blocks of 1s?
+  private static final int MAX_BLOCK_SIZE = 1 << 8;
+  private final int phyBlockSize;
+  private static final int HEADER = 8; /* two ints */
+
+  /** NOTE: maxBlockSize plus the max non-causal lookahead
+   *  of your codec must be less than 256.  EG Simple9
+   *  requires lookahead=1 because on seeing the Nth value
+   *  it knows it must now encode the N-1 values before it.
+   *
+   *  <p>Writes the header (maxBlockSize, then phyBlockSize) to {@code out}.
+   *
+   *  @throws IllegalArgumentException if maxBlockSize exceeds the supported
+   *          maximum or phyBlockSize is not positive */
+  protected VariableIntFixedPhyBlockIndexOutput(IndexOutput out, int maxBlockSize, int phyBlockSize) throws IOException {
+    if (maxBlockSize > MAX_BLOCK_SIZE) {
+      throw new IllegalArgumentException("maxBlockSize must be <= " + MAX_BLOCK_SIZE + "; got " + maxBlockSize);
+    }
+    // Index.write() divides by phyBlockSize; reject values that would make
+    // that division fail or produce meaningless block numbers.
+    if (phyBlockSize <= 0) {
+      throw new IllegalArgumentException("phyBlockSize must be > 0; got " + phyBlockSize);
+    }
+    this.out = out;
+    this.phyBlockSize = phyBlockSize;
+    out.writeInt(maxBlockSize);
+    out.writeInt(phyBlockSize);
+  }
+
+  /** Called one value at a time.  Return the number of
+   *  buffered input values that have been written to out. */
+  protected abstract int add(int value) throws IOException;
+
+  /** Returns a new index with all positions at zero. */
+  @Override
+  public Index index() throws IOException {
+    return new Index();
+  }
+
+  /** Seek point: file position of a block plus the count of pending values at mark time. */
+  private class Index extends IntIndexOutput.Index {
+    long fp;
+    int upto;
+    // Values as of the previous write()/set(), used to encode deltas.
+    long lastFP;
+    int lastUpto;
+
+    /** Captures the output's current file pointer and pending-value count. */
+    @Override
+    public void mark() throws IOException {
+      fp = out.getFilePointer();
+      upto = VariableIntFixedPhyBlockIndexOutput.this.upto;
+    }
+
+    /** Copies {@code other} (an Index of this class) into both the current and the "last" fields. */
+    @Override
+    public void set(IntIndexOutput.Index other) throws IOException {
+      Index idx = (Index) other;
+      lastFP = fp = idx.fp;
+      lastUpto = upto = idx.upto;
+    }
+
+    /**
+     * Encodes this seek point.
+     *
+     * <p>Absolute form: vLong block number, then {@code upto} as one byte.
+     * Relative form within the same block: vLong {@code (uptoDelta << 1) | 1}.
+     * Relative form for a new block: vLong {@code blockDelta << 1}, then
+     * {@code upto} as one byte.
+     */
+    @Override
+    public void write(IndexOutput indexOut, boolean absolute) throws IOException {
+      assert upto >= 0;
+      assert (fp - HEADER) % phyBlockSize == 0;
+      if (absolute) {
+        // upto is stored in a single byte; anything larger would be silently truncated.
+        assert upto <= 0xFF : "upto=" + upto + " does not fit in one byte";
+        indexOut.writeVLong((fp - HEADER) / phyBlockSize);
+        indexOut.writeByte((byte) upto);
+      } else if (fp == lastFP) {
+        // same block
+        assert upto >= lastUpto;
+        int uptoDelta = upto - lastUpto;
+        indexOut.writeVLong(uptoDelta << 1 | 1);
+      } else {
+        // new block
+        assert upto <= 0xFF : "upto=" + upto + " does not fit in one byte";
+        indexOut.writeVLong(((fp - lastFP) / phyBlockSize) << 1);
+        indexOut.writeByte((byte) upto);
+      }
+      lastUpto = upto;
+      lastFP = fp;
+    }
+
+    @Override
+    public String toString() {
+      return "VarIntFixedPhyBlock.Output fp=" + fp + " upto=" + upto;
+    }
+  }
+
+  // Set once any write() fails; close() then skips the final padding.
+  private boolean abort;
+
+  /**
+   * Hands one value to {@link #add} and updates the pending-value count:
+   * one more value is pending, minus however many add() reports as written.
+   */
+  @Override
+  public void write(int v) throws IOException {
+    boolean success = false;
+    try {
+      upto -= add(v)-1;
+      assert upto >= 0;
+      success = true;
+    } finally {
+      abort |= !success;
+    }
+  }
+
+  /**
+   * Pads with zeros so the pending values get written, unless an earlier
+   * write failed, and then closes the output in all cases.
+   */
+  @Override
+  public void close() throws IOException {
+    try {
+      // stuff 0s in until the "real" data is flushed:
+      if (!abort) {
+        int stuffed = 0;
+        while (upto > stuffed) {
+          upto -= add(0)-1;
+          assert upto >= 0;
+          stuffed += 1;
+        }
+      }
+    } finally {
+      out.close();
+    }
+  }
+}
Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java?rev=1069851&r1=1069850&r2=1069851&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java Fri Feb 11 15:53:10 2011
@@ -26,13 +26,13 @@ import org.apache.lucene.index.SegmentRe
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.intblock.VariableIntFixedPhyBlockIndexInput;
+import org.apache.lucene.index.codecs.intblock.VariableIntFixedPhyBlockIndexOutput;
import org.apache.lucene.index.codecs.sep.IntStreamFactory;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
import org.apache.lucene.index.codecs.sep.IntIndexOutput;
import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
-import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput;
-import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.BlockTermsReader;
@@ -73,7 +73,7 @@ public class Simple64Codec extends Codec
@Override
public IntIndexInput openInput(Directory dir, final String fileName, int readBufferSize) throws IOException {
- return new VariableIntBlockIndexInput(dir.openInput(fileName, readBufferSize)) {
+ return new VariableIntFixedPhyBlockIndexInput(dir.openInput(fileName, readBufferSize)) {
@Override
protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
@@ -100,7 +100,7 @@ public class Simple64Codec extends Codec
@Override
public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException {
- return new VariableIntBlockIndexOutput(dir.createOutput(fileName), 61*multiplier) {
+ return new VariableIntFixedPhyBlockIndexOutput(dir.createOutput(fileName), 61*multiplier, 8*multiplier) {
private final long[] buffer = new long[multiplier];
private int totWritten;
private int totConsumed;