You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/11 16:53:10 UTC

svn commit: r1069851 - in /lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs: intblock/VariableIntFixedPhyBlockIndexInput.java intblock/VariableIntFixedPhyBlockIndexOutput.java simple64/Simple64Codec.java

Author: rmuir
Date: Fri Feb 11 15:53:10 2011
New Revision: 1069851

URL: http://svn.apache.org/viewvc?rev=1069851&view=rev
Log:
LUCENE-2905: write pointers and skip data more efficiently for varint codecs with a fixed physical blocksize

Added:
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java   (with props)
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java   (with props)
Modified:
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java

Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java?rev=1069851&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexInput.java Fri Feb 11 15:53:10 2011
@@ -0,0 +1,203 @@
+package org.apache.lucene.index.codecs.intblock;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.BulkPostingsEnum;
+import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+
+// TODO: much of this can be shared code w/ the variable case
+// TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least)
+
+/** Abstract base class that reads blocks holding a variable number of ints,
+ *  each block occupying a fixed physical size in bytes, from an IndexInput.
+ *  The constructor reads a two-int header (maxBlockSize, phyBlockSize).
+ *  While this is a simple approach, a more performant approach would
+ *  directly create an impl of IntIndexInput inside Directory.  Wrapping a
+ *  generic IndexInput will likely cost performance.
+ *
+ * @lucene.experimental
+ */
+public abstract class VariableIntFixedPhyBlockIndexInput extends IntIndexInput {
+
+  protected final IndexInput in; // source input; reader() hands each Reader its own clone
+  protected final int maxBlockSize; // first header int; length of each Reader's int buffer
+  protected final int phyBlockSize; // second header int; byte distance between consecutive blocks
+  protected final static int HEADER = 8; /* 2 ints: maxBlockSize, phyBlockSize */
+  
+  protected VariableIntFixedPhyBlockIndexInput(final IndexInput in) throws IOException {
+    this.in = in;
+    maxBlockSize = in.readInt(); // header int 1
+    phyBlockSize = in.readInt(); // header int 2
+  }
+
+  @Override
+  public Reader reader() throws IOException {
+    final int[] buffer = new int[maxBlockSize]; // shared by the Reader and its BlockReader
+    final IndexInput clone = (IndexInput) in.clone();
+    // TODO: can this be simplified?
+    return new Reader(clone, buffer, this.getBlockReader(clone, buffer), phyBlockSize);
+  }
+
+  @Override
+  public void close() throws IOException {
+    in.close();
+  }
+
+  @Override
+  public Index index() {
+    return new Index();
+  }
+
+  protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException; // subclass supplies the block decoder
+
+  public interface BlockReader {
+    public int readBlock() throws IOException; // Reader stores the returned value as its limit
+    // nocommit -- do we really need?
+    //public void seek(long pos) throws IOException;
+  }
+
+  public static class Reader extends BulkPostingsEnum.BlockReader {
+    private final IndexInput in;
+
+    public final int[] pending; // the buffer handed out by getBuffer()
+
+    private int offset; // position within the current block, set by seek()
+    private long lastBlockFP; // file pointer of the block most recently read
+    //private int blockSize;                        // nocommit redundant w/ limit?
+    private final BlockReader blockReader;
+    private int limit; // value returned by the last readBlock(); reported by end()
+    private final int phyBlockSize;
+    
+    public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader, final int phyBlockSize)
+      throws IOException {
+      this.in = in;
+      this.pending = pending;
+      this.blockReader = blockReader;
+      this.phyBlockSize = phyBlockSize;
+    }
+
+    void seek(final long fp, final int upto) throws IOException {
+      //System.out.println("vintb seek fp=" + fp + " upto=" + upto);
+      // TODO: should we do this in real-time, not lazy?
+      offset = upto;
+      assert offset >= 0: "pendingUpto=" + offset;
+      if (fp != lastBlockFP) {
+        // Seek to new block
+        in.seek(fp);
+        // nocommit -- why?
+        //blockReader.seek(fp);
+        lastBlockFP = fp;
+        limit = blockReader.readBlock();
+      } else {
+        // Seek w/in current block
+      }
+
+      // TODO: if we were more clever when writing the
+      // index, such that a seek point wouldn't be written
+      // until the int encoder "committed", we could avoid
+      // this (likely minor) inefficiency:
+
+      // This is necessary for int encoders that are
+      // non-causal, ie must see future int values to
+      // encode the current ones.
+      while(offset >= limit) {
+        //System.out.println("NON CAUSAL! offset=" + offset + " limit=" + limit);
+        offset -= limit; // carry the remainder of the offset into the following block
+        fill();
+      }
+      //System.out.println("  after skip block offset=" + offset);
+    }
+
+    @Override
+    public int[] getBuffer() {
+      return pending;
+    }
+
+    @Override
+    public int end() {
+      return limit;
+    }
+
+    @Override
+    public int offset() {
+      return offset;
+    }
+
+    @Override
+    public int fill() throws IOException {
+      lastBlockFP += phyBlockSize; // advance the tracked block pointer by one physical block
+      return limit = blockReader.readBlock();
+    }
+  }
+
+  private class Index extends IntIndexInput.Index {
+    private long fp; // absolute file pointer of the block
+    private int upto; // offset within that block, passed to Reader.seek()
+
+    // This is used when reading skip data:
+    @Override
+    public void read(final DataInput indexIn, final boolean absolute) throws IOException {
+      if (absolute) {
+        fp = HEADER + (phyBlockSize * indexIn.readVLong()); // stored value is a block number
+        upto = indexIn.readByte()&0xFF; // treat the byte as unsigned
+      } else {
+        final long delta = indexIn.readVLong();
+        if ((delta & 1) == 1) { // low bit set: same block
+          // same block: remaining bits are the increase in upto
+          upto += (delta >>> 1);
+        } else {
+          // new block: remaining bits are the number of blocks to advance
+          fp += (phyBlockSize * (delta >>> 1));
+          upto = indexIn.readByte()&0xFF; // treat the byte as unsigned
+        }
+      }
+      // TODO: we can't do this assert because non-causal
+      // int encoders can have upto over the buffer size
+      //assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize;
+    }
+
+    @Override
+    public String toString() {
+      return "VarIntFixedPhyBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize;
+    }
+
+    @Override
+    public void seek(final BulkPostingsEnum.BlockReader other) throws IOException {
+      ((Reader) other).seek(fp, upto); // other must be a Reader from this class, else ClassCastException
+    }
+
+    @Override
+    public void set(final IntIndexInput.Index other) {
+      final Index idx = (Index) other;
+      fp = idx.fp;
+      upto = idx.upto;
+    }
+
+    @Override
+    public Object clone() {
+      Index other = new Index();
+      other.fp = fp;
+      other.upto = upto;
+      return other;
+    }
+  }
+}

Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java?rev=1069851&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java (added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntFixedPhyBlockIndexOutput.java Fri Feb 11 15:53:10 2011
@@ -0,0 +1,147 @@
+package org.apache.lucene.index.codecs.intblock;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.codecs.sep.IntIndexOutput;
+import org.apache.lucene.store.IndexOutput;
+
+//TODO: much of this can be shared code w/ the variable case
+//TODO: not specific to simple64, (e.g. can be used by simple9/simple16 at least)
+
+/** Abstract base class that writes blocks holding a variable number of ints,
+ *  each block occupying a fixed physical size in bytes, to an IndexOutput.
+ *  The constructor writes a two-int header (maxBlockSize, phyBlockSize).
+ *  While this is a simple approach, a more performant approach would
+ *  directly create an impl of IntIndexOutput inside Directory.  Wrapping a
+ *  generic IndexOutput will likely cost performance.
+ *
+ * @lucene.experimental
+ */
+public abstract class VariableIntFixedPhyBlockIndexOutput extends IntIndexOutput {
+
+  protected final IndexOutput out;
+
+  private int upto; // values handed to add() that it has not yet reported as written
+
+  // TODO: use vint so we can use unused simple selectors for larger blocks of 1s?
+  private static final int MAX_BLOCK_SIZE = 1 << 8;
+  private final int phyBlockSize;
+  private static final int HEADER = 8; /* two ints: maxBlockSize, phyBlockSize */
+  
+  /** NOTE: maxBlockSize plus the max non-causal lookahead
+   *  of your codec must be less than 256.  EG Simple9
+   *  requires lookahead=1 because on seeing the Nth value
+   *  it knows it must now encode the N-1 values before it. */
+  protected VariableIntFixedPhyBlockIndexOutput(IndexOutput out, int maxBlockSize, int phyBlockSize) throws IOException {
+    if (maxBlockSize > MAX_BLOCK_SIZE) { // NOTE(review): accepts 256 and ignores lookahead, unlike the javadoc's "less than 256" -- confirm
+      throw new IllegalArgumentException("maxBlockSize must be <= " + MAX_BLOCK_SIZE + "; got " + maxBlockSize);
+    }
+    this.out = out;
+    this.phyBlockSize = phyBlockSize;
+    out.writeInt(maxBlockSize); // header int 1
+    out.writeInt(phyBlockSize); // header int 2
+  }
+
+  /** Called one value at a time.  Return the number of
+   *  buffered input values that have been written to out. */
+  protected abstract int add(int value) throws IOException;
+
+  @Override
+  public Index index() throws IOException {
+    return new Index();
+  }
+
+  private class Index extends IntIndexOutput.Index {
+    long fp; // out's file pointer captured by mark()
+    int upto; // enclosing writer's upto captured by mark()
+    long lastFP; // fp as of the previous write() or set()
+    int lastUpto; // upto as of the previous write() or set()
+
+    @Override
+    public void mark() throws IOException {
+      fp = out.getFilePointer();
+      upto = VariableIntFixedPhyBlockIndexOutput.this.upto;
+    }
+
+    @Override
+    public void set(IntIndexOutput.Index other) throws IOException {
+      Index idx = (Index) other;
+      lastFP = fp = idx.fp;
+      lastUpto = upto = idx.upto;
+    }
+
+    @Override
+    public void write(IndexOutput indexOut, boolean absolute) throws IOException {
+      assert upto >= 0;
+      assert (fp - HEADER) % phyBlockSize == 0; // a mark is expected to sit on a block boundary
+      if (absolute) {
+        indexOut.writeVLong((fp - HEADER) / phyBlockSize); // block number rather than byte offset
+        indexOut.writeByte((byte) upto); // only the low 8 bits of upto are stored
+      } else if (fp == lastFP) {
+        // same block: low bit 1, remaining bits hold the upto delta
+        assert upto >= lastUpto;
+        int uptoDelta = upto - lastUpto;
+        indexOut.writeVLong(uptoDelta << 1 | 1);
+      } else {      
+        // new block: low bit 0, remaining bits hold the number of blocks advanced
+        indexOut.writeVLong(((fp - lastFP) / phyBlockSize) << 1);
+        indexOut.writeByte((byte) upto); // only the low 8 bits of upto are stored
+      }
+      lastUpto = upto;
+      lastFP = fp;
+    }
+
+    @Override
+    public String toString() {
+      return "VarIntFixedPhyBlock.Output fp=" + fp + " upto=" + upto;
+    }
+  }
+
+  private boolean abort; // set when write() did not complete normally; close() then skips the flush
+
+  @Override
+  public void write(int v) throws IOException {
+    boolean success = false;
+    try {
+      upto -= add(v)-1; // +1 for v itself, minus the count add() reports as written
+      assert upto >= 0;
+      success = true;
+    } finally {
+      abort |= !success;
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    try {
+      // stuff 0s in until the "real" data is flushed:
+      if (!abort) {
+        int stuffed = 0; // number of padding zeros added so far
+        while(upto > stuffed) {
+          upto -= add(0)-1;
+          assert upto >= 0;
+          stuffed += 1;
+        }
+      }
+    } finally {
+      out.close(); // always close the underlying output, even if flushing failed
+    }
+  }
+}

Modified: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java?rev=1069851&r1=1069850&r2=1069851&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/codecs/simple64/Simple64Codec.java Fri Feb 11 15:53:10 2011
@@ -26,13 +26,13 @@ import org.apache.lucene.index.SegmentRe
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.intblock.VariableIntFixedPhyBlockIndexInput;
+import org.apache.lucene.index.codecs.intblock.VariableIntFixedPhyBlockIndexOutput;
 import org.apache.lucene.index.codecs.sep.IntStreamFactory;
 import org.apache.lucene.index.codecs.sep.IntIndexInput;
 import org.apache.lucene.index.codecs.sep.IntIndexOutput;
 import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
 import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
-import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexInput;
-import org.apache.lucene.index.codecs.intblock.VariableIntBlockIndexOutput;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
 import org.apache.lucene.index.codecs.PostingsReaderBase;
 import org.apache.lucene.index.codecs.BlockTermsReader;
@@ -73,7 +73,7 @@ public class Simple64Codec extends Codec
 
     @Override
     public IntIndexInput openInput(Directory dir, final String fileName, int readBufferSize) throws IOException {
-      return new VariableIntBlockIndexInput(dir.openInput(fileName, readBufferSize)) {
+      return new VariableIntFixedPhyBlockIndexInput(dir.openInput(fileName, readBufferSize)) {
 
         @Override
         protected BlockReader getBlockReader(final IndexInput in, final int[] buffer) throws IOException {
@@ -100,7 +100,7 @@ public class Simple64Codec extends Codec
 
     @Override
     public IntIndexOutput createOutput(Directory dir, String fileName) throws IOException {
-      return new VariableIntBlockIndexOutput(dir.createOutput(fileName), 61*multiplier) {
+      return new VariableIntFixedPhyBlockIndexOutput(dir.createOutput(fileName), 61*multiplier, 8*multiplier) {
         private final long[] buffer = new long[multiplier];
         private int totWritten;
         private int totConsumed;