You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/07/16 10:51:38 UTC

svn commit: r1503619 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java

Author: jpountz
Date: Tue Jul 16 08:51:38 2013
New Revision: 1503619

URL: http://svn.apache.org/r1503619
Log:
LUCENE-5115: WAHDocIdSet's iterator cost() function now returns the exact cardinality of the set.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java?rev=1503619&r1=1503618&r2=1503619&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java Tue Jul 16 08:51:38 2013
@@ -26,7 +26,7 @@ import org.apache.lucene.search.DocIdSet
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.DataInput;
-import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
 /**
  * {@link DocIdSet} implementation based on word-aligned hybrid encoding on
@@ -82,15 +82,15 @@ public final class WAH8DocIdSet extends 
   private static final int MIN_INDEX_INTERVAL = 8;
 
   /** Default index interval. */
-  // To compute this default value, I created a rather dense set (0.1% load
-  // factor, which is close to the worst case regarding both compression and
-  // speed for this DocIdSet impl since sequences are going to be short) and I
-  // started with interval=1 and doubled it at each iteration until advance
-  // became slower
-  public static final int DEFAULT_INDEX_INTERVAL = 16;
+  public static final int DEFAULT_INDEX_INTERVAL = MIN_INDEX_INTERVAL;
 
-  private static final PackedInts.Reader EMPTY_READER = new PackedInts.NullReader(1);
-  private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], EMPTY_READER, EMPTY_READER);
+  private static final MonotonicAppendingLongBuffer SINGLE_ZERO_BUFFER = new MonotonicAppendingLongBuffer();
+  private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], 0, 1, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
+
+  static {
+    SINGLE_ZERO_BUFFER.add(0L);
+    SINGLE_ZERO_BUFFER.freeze();
+  }
 
   private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() {
     @Override
@@ -99,20 +99,6 @@ public final class WAH8DocIdSet extends 
     }
   };
 
-  /** Same as {@link #copyOf(DocIdSetIterator, int)} with the default index interval. */
-  public static WAH8DocIdSet copyOf(DocIdSetIterator it) throws IOException {
-    return copyOf(it, DEFAULT_INDEX_INTERVAL);
-  }
-
-  /** Return a copy of the provided iterator. */
-  public static WAH8DocIdSet copyOf(DocIdSetIterator it, int indexInterval) throws IOException {
-    Builder builder = new Builder().setIndexInterval(indexInterval);
-    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-      builder.add(doc);
-    }
-    return builder.build();
-  }
-
   /** Same as {@link #intersect(Collection, int)} with the default index interval. */
   public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
     return intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
@@ -242,6 +228,7 @@ public final class WAH8DocIdSet extends 
     int lastWordNum;
     int numSequences;
     int indexInterval;
+    int cardinality;
 
     WordBuilder() {
       out = new GrowableByteArrayDataOutput(1024);
@@ -250,14 +237,15 @@ public final class WAH8DocIdSet extends 
       lastWordNum = -1;
       numSequences = 0;
       indexInterval = DEFAULT_INDEX_INTERVAL;
+      cardinality = 0;
     }
 
     /** Set the index interval. Smaller index intervals improve performance of
      *  {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
      *  larger. An index interval <code>i</code> makes the index add an overhead
      *  which is at most <code>4/i</code>, but likely much less.The default index
-     *  interval is <code>16</code>, meaning the index has an overhead of at most
-     *  25%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an
+     *  interval is <code>8</code>, meaning the index has an overhead of at most
+     *  50%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an
      *  index interval. */
     public WordBuilder setIndexInterval(int indexInterval) {
       if (indexInterval < MIN_INDEX_INTERVAL) {
@@ -322,11 +310,13 @@ public final class WAH8DocIdSet extends 
         }
       }
       lastWordNum = wordNum;
+      cardinality += BitUtil.bitCount(word);
     }
 
     /** Build a new {@link WAH8DocIdSet}. */
     public WAH8DocIdSet build() {
-      if (lastWordNum == -1) {
+      if (cardinality == 0) {
+        assert lastWordNum == -1;
         return EMPTY;
       }
       writeSequence(clean);
@@ -334,16 +324,18 @@ public final class WAH8DocIdSet extends 
 
       // Now build the index
       final int valueCount = (numSequences - 1) / indexInterval + 1;
-      final PackedInts.Reader indexPositions;
-      final PackedInts.Reader indexWordNums;
+      final MonotonicAppendingLongBuffer indexPositions, indexWordNums;
       if (valueCount <= 1) {
-        indexPositions = indexWordNums = EMPTY_READER;
+        indexPositions = indexWordNums = SINGLE_ZERO_BUFFER;
       } else {
-        // From the tests I ran, there is no need to expose acceptableOverheadRatio, these packed ints are never the bottleneck
-        final PackedInts.Mutable positions = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(data.length - 1), PackedInts.COMPACT);
-        final PackedInts.Mutable wordNums = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(lastWordNum), PackedInts.COMPACT);
-  
-        final Iterator it = new Iterator(data, null, null);
+        final int pageSize = 128;
+        final int initialPageCount = (valueCount + pageSize - 1) / pageSize;
+        final MonotonicAppendingLongBuffer positions = new MonotonicAppendingLongBuffer(initialPageCount, pageSize);
+        final MonotonicAppendingLongBuffer wordNums = new MonotonicAppendingLongBuffer(initialPageCount, pageSize);
+ 
+        positions.add(0L);
+        wordNums.add(0L);
+        final Iterator it = new Iterator(data, cardinality, Integer.MAX_VALUE, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
         assert it.in.getPosition() == 0;
         assert it.wordNum == -1;
         for (int i = 1; i < valueCount; ++i) {
@@ -355,14 +347,16 @@ public final class WAH8DocIdSet extends 
           }
           final int position = it.in.getPosition();
           final int wordNum = it.wordNum;
-          positions.set(i, position);
-          wordNums.set(i, wordNum + 1);
+          positions.add(position);
+          wordNums.add(wordNum + 1);
         }
+        positions.freeze();
+        wordNums.freeze();
         indexPositions = positions;
         indexWordNums = wordNums;
       }
 
-      return new WAH8DocIdSet(data, indexPositions, indexWordNums);
+      return new WAH8DocIdSet(data, cardinality, indexInterval, indexPositions, indexWordNums);
     }
 
   }
@@ -401,6 +395,14 @@ public final class WAH8DocIdSet extends 
       return this;
     }
 
+    /** Add the content of the provided {@link DocIdSetIterator}. */
+    public Builder add(DocIdSetIterator disi) throws IOException {
+      for (int doc = disi.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.nextDoc()) {
+        add(doc);
+      }
+      return this;
+    }
+
     @Override
     public Builder setIndexInterval(int indexInterval) {
       return (Builder) super.setIndexInterval(indexInterval);
@@ -418,11 +420,15 @@ public final class WAH8DocIdSet extends 
 
   // where the doc IDs are stored
   private final byte[] data;
+  private final int cardinality;
+  private final int indexInterval;
   // index for advance(int)
-  private final PackedInts.Reader positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
+  private final MonotonicAppendingLongBuffer positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
 
-  WAH8DocIdSet(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
+  WAH8DocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
     this.data = data;
+    this.cardinality = cardinality;
+    this.indexInterval = indexInterval;
     this.positions = positions;
     this.wordNums = wordNums;
   }
@@ -434,7 +440,7 @@ public final class WAH8DocIdSet extends 
 
   @Override
   public Iterator iterator() {
-    return new Iterator(data, positions, wordNums);
+    return new Iterator(data, cardinality, indexInterval, positions, wordNums);
   }
 
   static int readLength(ByteArrayDataInput in, int len) {
@@ -448,22 +454,28 @@ public final class WAH8DocIdSet extends 
   static class Iterator extends DocIdSetIterator {
 
     final ByteArrayDataInput in;
-    final PackedInts.Reader positions, wordNums;
+    final int cardinality;
+    final int indexInterval;
+    final MonotonicAppendingLongBuffer positions, wordNums;
     int dirtyLength;
 
     int wordNum; // byte offset
     byte word; // current word
     int bitList; // list of bits set in the current word
+    int sequenceNum; // in which sequence are we?
 
     int docID;
 
-    Iterator(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
+    Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingLongBuffer positions, MonotonicAppendingLongBuffer wordNums) {
       this.in = new ByteArrayDataInput(data);
+      this.cardinality = cardinality;
+      this.indexInterval = indexInterval;
       this.positions = positions;
       this.wordNums = wordNums;
       wordNum = -1;
       word = 0;
       bitList = 0;
+      sequenceNum = -1;
       docID = -1;
     }
 
@@ -476,6 +488,7 @@ public final class WAH8DocIdSet extends 
       final int cleanLength = (in.getPosition() == 1 ? 0 : 2) + readLength(in, token >>> 4);
       wordNum += cleanLength;
       dirtyLength = 1 + readLength(in, token & 0x0F);
+      ++sequenceNum;
       return true;
     }
 
@@ -508,8 +521,25 @@ public final class WAH8DocIdSet extends 
       --dirtyLength;
     }
 
-    int binarySearch(int targetWordNum) {
-      int lo = 0, hi = positions.size() - 1;
+    int forwardBinarySearch(int targetWordNum) {
+      // advance forward and double the window at each step
+      final int indexSize = (int) wordNums.size();
+      int lo = sequenceNum / indexInterval, hi = lo + 1;
+      assert sequenceNum == -1 || wordNums.get(lo) <= wordNum;
+      assert lo + 1 == wordNums.size() || wordNums.get(lo + 1) > wordNum;
+      while (true) {
+        if (hi >= indexSize) {
+          hi = indexSize - 1;
+          break;
+        } else if (wordNums.get(hi) >= targetWordNum) {
+          break;
+        }
+        final int newLo = hi;
+        hi += (hi - lo) << 1;
+        lo = newLo;
+      }
+
+      // we found a window containing our target, let's binary search now
       while (lo <= hi) {
         final int mid = (lo + hi) >>> 1;
         final int midWordNum = (int) wordNums.get(mid);
@@ -526,9 +556,6 @@ public final class WAH8DocIdSet extends 
 
     void advanceWord(int targetWordNum) {
       assert targetWordNum > wordNum;
-      if (dirtyLength == 0 && !readSequence()) {
-        return;
-      }
       int delta = targetWordNum - wordNum;
       if (delta <= dirtyLength + 1) {
         if (delta > 1) {
@@ -538,11 +565,12 @@ public final class WAH8DocIdSet extends 
         skipDirtyBytes();
         assert dirtyLength == 0;
         // use the index
-        final int i = binarySearch(targetWordNum);
+        final int i = forwardBinarySearch(targetWordNum);
         final int position = (int) positions.get(i);
         if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
           wordNum = (int) wordNums.get(i) - 1;
           in.setPosition(position);
+          sequenceNum = i * indexInterval - 1;
         }
 
         while (true) {
@@ -599,24 +627,19 @@ public final class WAH8DocIdSet extends 
 
     @Override
     public long cost() {
-      return in.length(); // good estimation of the cost of iterating over all docs
+      return cardinality;
     }
 
   }
 
-  /** Return the number of documents in this {@link DocIdSet}. This method
-   *  runs in linear time but is much faster than counting documents. */
+  /** Return the number of documents in this {@link DocIdSet} in constant time. */
   public int cardinality() {
-    int cardinality = 0;
-    for (Iterator it = iterator(); it.wordNum != Integer.MAX_VALUE; it.nextWord()) {
-      cardinality += BitUtil.bitCount(it.word);
-    }
     return cardinality;
   }
 
   /** Return the memory usage of this class in bytes. */
   public long ramBytesUsed() {
-    return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF)
+    return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_INT)
         + RamUsageEstimator.sizeOf(data)
         + positions.ramBytesUsed()
         + wordNums.ramBytesUsed();

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java?rev=1503619&r1=1503618&r2=1503619&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/packed/AbstractAppendingLongBuffer.java Tue Jul 16 08:51:38 2013
@@ -95,9 +95,7 @@ abstract class AbstractAppendingLongBuff
 
   /** Get a value from this buffer. */
   public final long get(long index) {
-    if (index < 0 || index >= size()) {
-      throw new IndexOutOfBoundsException("" + index);
-    }
+    assert index >= 0 && index < size();
     final int block = (int) (index >> pageShift);
     final int element = (int) (index & pageMask);
     return get(block, element);