You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/07/10 21:24:24 UTC
svn commit: r1501925 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/codecs/compressing/
core/src/java/org/apache/lucene/codecs/lucene40/
core/src/java/org/apache/lucene/util/ core/src/test/org/apache/lucene/util/
Author: jpountz
Date: Wed Jul 10 19:24:24 2013
New Revision: 1501925
URL: http://svn.apache.org/r1501925
Log:
LUCENE-5081: WAH8DocIdSet.
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (with props)
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java (with props)
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java (with props)
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BitUtil.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Wed Jul 10 19:24:24 2013
@@ -51,6 +51,9 @@ New features
* LUCENE-5084: Added new Elias-Fano encoder, decoder and DocIdSet
implementations. (Paul Elschot via Adrien Grand)
+* LUCENE-5081: Added WAH8DocIdSet, an in-memory doc id set implementation based
+ on word-aligned hybrid encoding. (Adrien Grand)
+
======================= Lucene 4.4.0 =======================
Changes in backwards compatibility policy
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsIndexWriter.java Wed Jul 10 19:24:24 2013
@@ -21,7 +21,6 @@ import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.packed.PackedInts;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Wed Jul 10 19:24:24 2013
@@ -44,6 +44,7 @@ import org.apache.lucene.store.IndexOutp
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Wed Jul 10 19:24:24 2013
@@ -45,6 +45,7 @@ import org.apache.lucene.store.IndexOutp
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/lucene40/BitVector.java Wed Jul 10 19:24:24 2013
@@ -26,6 +26,7 @@ import org.apache.lucene.store.Directory
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MutableBits;
@@ -166,7 +167,7 @@ final class BitVector implements Cloneab
int c = 0;
int end = bits.length;
for (int i = 0; i < end; i++) {
- c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
+ c += BitUtil.bitCount(bits[i]); // sum bits per byte
}
count = c;
}
@@ -179,29 +180,12 @@ final class BitVector implements Cloneab
int c = 0;
int end = bits.length;
for (int i = 0; i < end; i++) {
- c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte
+ c += BitUtil.bitCount(bits[i]); // sum bits per byte
}
return c;
}
- private static final byte[] BYTE_COUNTS = { // table of bits/byte
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
- };
+
private static String CODEC = "BitVector";
@@ -294,7 +278,7 @@ final class BitVector implements Cloneab
output.writeVInt(i-last);
output.writeByte(bits[i]);
last = i;
- numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]);
+ numCleared -= (8-BitUtil.bitCount(bits[i]));
assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7)));
}
}
@@ -399,7 +383,7 @@ final class BitVector implements Cloneab
while (n>0) {
last += input.readVInt();
bits[last] = input.readByte();
- n -= BYTE_COUNTS[bits[last] & 0xFF];
+ n -= BitUtil.bitCount(bits[last]);
assert n >= 0;
}
}
@@ -416,7 +400,7 @@ final class BitVector implements Cloneab
while (numCleared>0) {
last += input.readVInt();
bits[last] = input.readByte();
- numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF];
+ numCleared -= 8-BitUtil.bitCount(bits[last]);
assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7)));
}
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BitUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BitUtil.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BitUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/BitUtil.java Wed Jul 10 19:24:24 2013
@@ -22,8 +22,93 @@ package org.apache.lucene.util; // from
*/
public final class BitUtil {
+ private static final byte[] BYTE_COUNTS = { // table of bits/byte
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+ };
+
+ // The General Idea: instead of having an array per byte that has
+ // the offsets of the next set bit, that array could be
+ // packed inside a 32 bit integer (8 4 bit numbers). That
+ // should be faster than accessing an array for each index, and
+ // the total array size is kept smaller (256*sizeof(int))=1K
+ /***** the python code that generated bitlist
+ def bits2int(val):
+ arr=0
+ for shift in range(8,0,-1):
+ if val & 0x80:
+ arr = (arr << 4) | shift
+ val = val << 1
+ return arr
+
+ def int_table():
+ tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
+ return ','.join(tbl)
+ ******/
+ private static final int[] BIT_LISTS = {
+ 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
+ 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
+ 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
+ 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
+ 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
+ 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
+ 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
+ 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
+ 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
+ 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
+ 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
+ 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
+ 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
+ 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
+ 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
+ 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
+ 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
+ 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
+ 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
+ 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
+ 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
+ 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
+ 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
+ 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
+ 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
+ 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
+ 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
+ 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
+ };
+
private BitUtil() {} // no instance
+ /** Return the number of bits sets in b. */
+ public static int bitCount(byte b) {
+ return BYTE_COUNTS[b & 0xFF];
+ }
+
+ /** Return the list of bits which are set in b encoded as followed:
+ * <code>(i >>> (4 * n)) & 0x0F</code> is the offset of the n-th set bit of
+ * the given byte plus one, or 0 if there are n or less bits set in the given
+ * byte. For example <code>bitList(12)</code> returns 0x43:<ul>
+ * <li><code>0x43 & 0x0F</code> is 3, meaning the first bit set is at offset 3-1 = 2,</li>
+ * <li><code>(0x43 >>> 4) & 0x0F</code> is 4, meaning there is a second bit set at offset 4-1=3,</li>
+ * <li><code>(0x43 >>> 8) & 0x0F</code> is 0, meaning there is no more bit set in this byte.</li>
+ * </ul>*/
+ public static int bitList(byte b) {
+ return BIT_LISTS[b & 0xFF];
+ }
+
// The pop methods used to rely on bit-manipulation tricks for speed but it
// turns out that it is faster to use the Long.bitCount method (which is an
// intrinsic since Java 6u18) in a naive loop, see LUCENE-2221
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java?rev=1501925&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java Wed Jul 10 19:24:24 2013
@@ -0,0 +1,57 @@
+package org.apache.lucene.util;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.DataOutput;
+
+/**
+ * A {@link DataOutput} that can be used to build a byte[].
+ * @lucene.internal
+ */
+public final class GrowableByteArrayDataOutput extends DataOutput {
+
+ /** The bytes */
+ public byte[] bytes;
+ /** The length */
+ public int length;
+
+ /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
+ public GrowableByteArrayDataOutput(int cp) {
+ this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
+ this.length = 0;
+ }
+
+ @Override
+ public void writeByte(byte b) {
+ if (length >= bytes.length) {
+ bytes = ArrayUtil.grow(bytes);
+ }
+ bytes[length++] = b;
+ }
+
+ @Override
+ public void writeBytes(byte[] b, int off, int len) {
+ final int newLength = length + len;
+ if (newLength > bytes.length) {
+ bytes = ArrayUtil.grow(bytes, newLength);
+ }
+ System.arraycopy(b, off, bytes, length, len);
+ length = newLength;
+ }
+
+}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java?rev=1501925&r1=1501924&r2=1501925&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/OpenBitSetIterator.java Wed Jul 10 19:24:24 2013
@@ -25,55 +25,6 @@ import org.apache.lucene.search.DocIdSet
*/
public class OpenBitSetIterator extends DocIdSetIterator {
- // The General Idea: instead of having an array per byte that has
- // the offsets of the next set bit, that array could be
- // packed inside a 32 bit integer (8 4 bit numbers). That
- // should be faster than accessing an array for each index, and
- // the total array size is kept smaller (256*sizeof(int))=1K
- protected final static int[] bitlist={
- 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43,
- 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321,
- 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62,
- 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643,
- 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532,
- 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321,
- 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742,
- 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753,
- 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431,
- 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632,
- 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321,
- 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654,
- 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8,
- 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421,
- 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531,
- 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432,
- 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864,
- 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651,
- 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541,
- 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871,
- 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742,
- 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521,
- 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421,
- 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621,
- 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421,
- 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521,
- 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542,
- 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321
- };
- /***** the python code that generated bitlist
- def bits2int(val):
- arr=0
- for shift in range(8,0,-1):
- if val & 0x80:
- arr = (arr << 4) | shift
- val = val << 1
- return arr
-
- def int_table():
- tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
- return ','.join(tbl)
- ******/
-
// hmmm, what about an iterator that finds zeros though,
// or a reverse iterator... should they be separate classes
// for efficiency, or have a common root interface? (or
@@ -101,7 +52,7 @@ public class OpenBitSetIterator extends
if ((int)word ==0) {wordShift +=32; word = word >>>32; }
if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
- indexArray = bitlist[(int)word & 0xff];
+ indexArray = BitUtil.bitList((byte) word);
}
/***** alternate shift implementations
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java?rev=1501925&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/WAH8DocIdSet.java Wed Jul 10 19:24:24 2013
@@ -0,0 +1,625 @@
+package org.apache.lucene.util;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * {@link DocIdSet} implementation based on word-aligned hybrid encoding on
+ * words of 8 bits.
+ * <p>This implementation doesn't support random-access but has a fast
+ * {@link DocIdSetIterator} which can advance in logarithmic time thanks to
+ * an index.</p>
+ * <p>The compression scheme is simplistic and should work well with sparse doc
+ * id sets while being only slightly larger than a {@link FixedBitSet} for
+ * incompressible sets (overhead &lt; 2% in the worst case) in spite of the index.</p>
+ * <p><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean,
+ * meaning composed only of zeros, or dirty, meaning that it contains at least one
+ * bit set. The idea is to encode sequences of clean words using run-length
+ * encoding and to leave sequences of dirty words as-is.</p>
+ * <table>
+ * <tr><th>Token</th><th>Clean length+</th><th>Dirty length+</th><th>Dirty words</th></tr>
+ * <tr><td>1 byte</td><td>0-n bytes</td><td>0-n bytes</td><td>0-n bytes</td></tr>
+ * </table>
+ * <ul>
+ * <li><b>Token</b> encodes the number of clean words minus 2 on the first 4
+ * bits and the number of dirty words minus 1 on the last 4 bits. The
+ * higher-order bit is a continuation bit, meaning that the number is incomplete
+ * and needs additional bytes to be read.</li>
+ * <li><b>Clean length+</b>: If clean length has its higher-order bit set,
+ * you need to read a {@link DataInput#readVInt() vint}, shift it by 3 bits on
+ * the left side and add it to the 3 bits which have been read in the token.</li>
+ * <li><b>Dirty length+</b> works the same way as <b>Clean length+</b> but
+ * for the length of dirty words.</li>
+ * <li><b>Dirty words</b> are the dirty words, there are <b>Dirty length</b>
+ * of them.</li>
+ * </ul>
+ * <p>This format cannot encode sequences of less than 2 clean words and 1 dirty
+ * word. The reason is that if you find a single clean word, you should rather
+ * encode it as a dirty word. This takes the same space as starting a new
+ * sequence (since you need one byte for the token) but will be lighter to
+ * decode. There is however an exception for the first sequence. Since the first
+ * sequence may start directly with a dirty word, the clean length is encoded
+ * directly, without subtracting 2.</p>
+ * <p>There is an additional restriction on the format: the sequence of dirty
+ * words must start and end with a non-null word and is not allowed to contain
+ * two consecutive null words. This restriction exists to make sure no space is
+ * wasted and to make sure iterators can read the next doc ID by reading at most
+ * 2 dirty words.</p>
+ * @lucene.experimental
+ */
+public final class WAH8DocIdSet extends DocIdSet {
+
+ // Minimum index interval, intervals below this value can't guarantee anymore
+ // that this set implementation won't be significantly larger than a FixedBitSet
+ // The reason is that a single sequence saves at least one byte and an index
+ // entry requires at most 8 bytes (2 ints) so there shouldn't be more than one
+ // index entry every 8 sequences
+ private static final int MIN_INDEX_INTERVAL = 8;
+
+ /** Default index interval. */
+ // To compute this default value, I created a rather dense set (0.1% load
+ // factor, which is close to the worst case regarding both compression and
+ // speed for this DocIdSet impl since sequences are going to be short) and I
+ // started with interval=1 and doubled it at each iteration until advance
+ // became slower
+ public static final int DEFAULT_INDEX_INTERVAL = 16;
+
+ private static final PackedInts.Reader EMPTY_READER = new PackedInts.NullReader(1);
+ private static WAH8DocIdSet EMPTY = new WAH8DocIdSet(new byte[0], EMPTY_READER, EMPTY_READER);
+
+ private static final Comparator<Iterator> SERIALIZED_LENGTH_COMPARATOR = new Comparator<Iterator>() {
+ @Override
+ public int compare(Iterator wi1, Iterator wi2) {
+ return wi1.in.length() - wi2.in.length();
+ }
+ };
+
+ /** Same as {@link #copyOf(DocIdSetIterator, int)} with the default index interval. */
+ public static WAH8DocIdSet copyOf(DocIdSetIterator it) throws IOException {
+ return copyOf(it, DEFAULT_INDEX_INTERVAL);
+ }
+
+ /** Return a copy of the provided iterator. */
+ public static WAH8DocIdSet copyOf(DocIdSetIterator it, int indexInterval) throws IOException {
+ Builder builder = new Builder().setIndexInterval(indexInterval);
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ builder.add(doc);
+ }
+ return builder.build();
+ }
+
+ /** Same as {@link #intersect(Collection, int)} with the default index interval. */
+ public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets) {
+ return intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
+ }
+
+ /**
+ * Compute the intersection of the provided sets. This method is much faster than
+ * computing the intersection manually since it operates directly at the byte level.
+ */
+ public static WAH8DocIdSet intersect(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
+ switch (docIdSets.size()) {
+ case 0:
+ throw new IllegalArgumentException("There must be at least one set to intersect");
+ case 1:
+ return docIdSets.iterator().next();
+ }
+ // The logic below is similar to ConjunctionScorer
+ final int numSets = docIdSets.size();
+ final Iterator[] iterators = new Iterator[numSets];
+ int i = 0;
+ for (WAH8DocIdSet set : docIdSets) {
+ final Iterator it = set.iterator();
+ iterators[i++] = it;
+ }
+ Arrays.sort(iterators, SERIALIZED_LENGTH_COMPARATOR);
+ final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
+ int wordNum = 0;
+ main:
+ while (true) {
+ // Advance the least costly iterator first
+ iterators[0].advanceWord(wordNum);
+ wordNum = iterators[0].wordNum;
+ if (wordNum == DocIdSetIterator.NO_MORE_DOCS) {
+ break;
+ }
+ byte word = iterators[0].word;
+ for (i = 1; i < numSets; ++i) {
+ if (iterators[i].wordNum < wordNum) {
+ iterators[i].advanceWord(wordNum);
+ }
+ if (iterators[i].wordNum > wordNum) {
+ wordNum = iterators[i].wordNum;
+ continue main;
+ }
+ assert iterators[i].wordNum == wordNum;
+ word &= iterators[i].word;
+ if (word == 0) {
+ // There are common words, but they don't share any bit
+ ++wordNum;
+ continue main;
+ }
+ }
+ // Found a common word
+ assert word != 0;
+ builder.addWord(wordNum, word);
+ ++wordNum;
+ }
+ return builder.build();
+ }
+
+ /** Same as {@link #union(Collection, int)} with the default index interval. */
+ public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets) {
+ return union(docIdSets, DEFAULT_INDEX_INTERVAL);
+ }
+
+ /**
+ * Compute the union of the provided sets. This method is much faster than
+ * computing the union manually since it operates directly at the byte level.
+ */
+ public static WAH8DocIdSet union(Collection<WAH8DocIdSet> docIdSets, int indexInterval) {
+ switch (docIdSets.size()) {
+ case 0:
+ return EMPTY;
+ case 1:
+ return docIdSets.iterator().next();
+ }
+ // The logic below is very similar to DisjunctionScorer
+ final int numSets = docIdSets.size();
+ final PriorityQueue<Iterator> iterators = new PriorityQueue<WAH8DocIdSet.Iterator>(numSets) {
+ @Override
+ protected boolean lessThan(Iterator a, Iterator b) {
+ return a.wordNum < b.wordNum;
+ }
+ };
+ for (WAH8DocIdSet set : docIdSets) {
+ Iterator iterator = set.iterator();
+ iterator.nextWord();
+ iterators.add(iterator);
+ }
+
+ Iterator top = iterators.top();
+ if (top.wordNum == Integer.MAX_VALUE) {
+ return EMPTY;
+ }
+ int wordNum = top.wordNum;
+ byte word = top.word;
+ final WordBuilder builder = new WordBuilder().setIndexInterval(indexInterval);
+ while (true) {
+ top.nextWord();
+ iterators.updateTop();
+ top = iterators.top();
+ if (top.wordNum == wordNum) {
+ word |= top.word;
+ } else {
+ builder.addWord(wordNum, word);
+ if (top.wordNum == Integer.MAX_VALUE) {
+ break;
+ }
+ wordNum = top.wordNum;
+ word = top.word;
+ }
+ }
+ return builder.build();
+ }
+
+ static int wordNum(int docID) {
+ assert docID >= 0;
+ return docID >>> 3;
+ }
+
+ /** Word-based builder. */
+ static class WordBuilder {
+
+ final GrowableByteArrayDataOutput out;
+ final GrowableByteArrayDataOutput dirtyWords;
+ int clean;
+ int lastWordNum;
+ int numSequences;
+ int indexInterval;
+
+ WordBuilder() {
+ out = new GrowableByteArrayDataOutput(1024);
+ dirtyWords = new GrowableByteArrayDataOutput(128);
+ clean = 0;
+ lastWordNum = -1;
+ numSequences = 0;
+ indexInterval = DEFAULT_INDEX_INTERVAL;
+ }
+
+ /** Set the index interval. Smaller index intervals improve performance of
+ * {@link DocIdSetIterator#advance(int)} but make the {@link DocIdSet}
+ * larger. An index interval <code>i</code> makes the index add an overhead
+ * which is at most <code>4/i</code>, but likely much less.The default index
+ * interval is <code>16</code>, meaning the index has an overhead of at most
+ * 25%. To disable indexing, you can pass {@link Integer#MAX_VALUE} as an
+ * index interval. */
+ public WordBuilder setIndexInterval(int indexInterval) {
+ if (indexInterval < MIN_INDEX_INTERVAL) {
+ throw new IllegalArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
+ }
+ this.indexInterval = indexInterval;
+ return this;
+ }
+
+ void writeHeader(int cleanLength) throws IOException {
+ final int cleanLengthMinus2 = cleanLength - 2;
+ final int dirtyLengthMinus1 = dirtyWords.length - 1;
+ assert cleanLengthMinus2 >= 0;
+ assert dirtyLengthMinus1 >= 0;
+ int token = ((cleanLengthMinus2 & 0x07) << 4) | (dirtyLengthMinus1 & 0x07);
+ if (cleanLengthMinus2 > 0x07) {
+ token |= 1 << 7;
+ }
+ if (dirtyLengthMinus1 > 0x07) {
+ token |= 1 << 3;
+ }
+ out.writeByte((byte) token);
+ if (cleanLengthMinus2 > 0x07) {
+ out.writeVInt(cleanLengthMinus2 >>> 3);
+ }
+ if (dirtyLengthMinus1 > 0x07) {
+ out.writeVInt(dirtyLengthMinus1 >>> 3);
+ }
+ }
+
+ void writeSequence(int cleanLength) {
+ try {
+ writeHeader(cleanLength);
+ out.writeBytes(dirtyWords.bytes, dirtyWords.length);
+ } catch (IOException cannotHappen) {
+ throw new AssertionError(cannotHappen);
+ }
+ dirtyWords.length = 0;
+ ++numSequences;
+ }
+
+ /** Append the non-null dirty {@code word} at position {@code wordNum}.
+  * Words must be added in strictly increasing wordNum order; a gap of exactly
+  * one word is encoded inline as a single null dirty byte instead of
+  * starting a new sequence. */
+ void addWord(int wordNum, byte word) {
+ assert wordNum > lastWordNum;
+ assert word != 0;
+
+ if (lastWordNum == -1) {
+ clean = 2 + wordNum; // special case for the 1st sequence
+ dirtyWords.writeByte(word);
+ } else {
+ switch (wordNum - lastWordNum) {
+ case 1:
+ // consecutive word: extend the current run of dirty words
+ dirtyWords.writeByte(word);
+ break;
+ case 2:
+ // single-word gap: cheaper to inline one null byte than to start a new
+ // sequence (the decoder relies on no two consecutive null dirty words)
+ dirtyWords.writeByte((byte) 0);
+ dirtyWords.writeByte(word);
+ break;
+ default:
+ // larger gap: flush the current sequence, the gap minus one becomes the
+ // clean length of the next sequence
+ writeSequence(clean);
+ clean = wordNum - lastWordNum - 1;
+ dirtyWords.writeByte(word);
+ }
+ }
+ lastWordNum = wordNum;
+ }
+
+ /** Build a new {@link WAH8DocIdSet}: flush the pending sequence, snapshot
+  * the encoded bytes, then replay them to sample every indexInterval-th
+  * sequence into a (position, wordNum) index used by advance(int). */
+ public WAH8DocIdSet build() {
+ if (lastWordNum == -1) {
+ return EMPTY; // no word was ever added
+ }
+ writeSequence(clean);
+ final byte[] data = Arrays.copyOf(out.bytes, out.length);
+
+ // Now build the index
+ final int valueCount = (numSequences - 1) / indexInterval + 1;
+ final PackedInts.Reader indexPositions;
+ final PackedInts.Reader indexWordNums;
+ if (valueCount <= 1) {
+ // a one-entry index would always point at the start: skip it entirely
+ indexPositions = indexWordNums = EMPTY_READER;
+ } else {
+ // From the tests I ran, there is no need to expose acceptableOverheadRatio, these packed ints are never the bottleneck
+ final PackedInts.Mutable positions = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(data.length - 1), PackedInts.COMPACT);
+ final PackedInts.Mutable wordNums = PackedInts.getMutable(valueCount, PackedInts.bitsRequired(lastWordNum), PackedInts.COMPACT);
+
+ // replay the encoded data with an index-less iterator, recording a
+ // checkpoint every indexInterval sequences; slot 0 keeps its default of 0,
+ // i.e. position 0 / wordNum -1 + 1, matching a fresh iterator
+ final Iterator it = new Iterator(data, null, null);
+ assert it.in.getPosition() == 0;
+ assert it.wordNum == -1;
+ for (int i = 1; i < valueCount; ++i) {
+ // skip indexInterval sequences
+ for (int j = 0; j < indexInterval; ++j) {
+ final boolean readSequence = it.readSequence();
+ assert readSequence;
+ it.skipDirtyBytes();
+ }
+ final int position = it.in.getPosition();
+ final int wordNum = it.wordNum;
+ positions.set(i, position);
+ wordNums.set(i, wordNum + 1); // +1 so entries stay non-negative
+ }
+ indexPositions = positions;
+ indexWordNums = wordNums;
+ }
+
+ return new WAH8DocIdSet(data, indexPositions, indexWordNums);
+ }
+
+ }
+
+ /** A builder for {@link WAH8DocIdSet}s. */
+ public static final class Builder extends WordBuilder {
+
+   private int lastDocID;        // last doc ID that was added, -1 if none
+   private int wordNum, word;    // word currently being filled, -1/0 if none
+
+   /** Sole constructor */
+   public Builder() {
+     super();
+     lastDocID = -1;
+     wordNum = -1;
+     word = 0;
+   }
+
+   /** Add a document to this builder. Documents must be added in order. */
+   public Builder add(int docID) {
+     if (docID <= lastDocID) {
+       throw new IllegalArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID);
+     }
+     final int targetWordNum = wordNum(docID);
+     if (targetWordNum == this.wordNum) {
+       // same word as the previous doc: just set one more bit
+       word |= 1 << (docID & 0x07);
+     } else {
+       // moving to a new word: flush the completed one (if any) and restart
+       if (this.wordNum != -1) {
+         addWord(this.wordNum, (byte) word);
+       }
+       this.wordNum = targetWordNum;
+       word = 1 << (docID & 0x07);
+     }
+     lastDocID = docID;
+     return this;
+   }
+
+   @Override
+   public Builder setIndexInterval(int indexInterval) {
+     return (Builder) super.setIndexInterval(indexInterval);
+   }
+
+   @Override
+   public WAH8DocIdSet build() {
+     // flush the partially-filled last word before sealing the set
+     if (this.wordNum != -1) {
+       addWord(wordNum, (byte) word);
+     }
+     return super.build();
+   }
+
+ }
+
+ // where the doc IDs are stored
+ private final byte[] data;
+ // index for advance(int)
+ private final PackedInts.Reader positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
+
+ /** Package-private constructor: takes ownership of the encoded {@code data}
+  * and its pre-computed index; instances are created via {@link Builder}. */
+ WAH8DocIdSet(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
+ this.data = data;
+ this.positions = positions;
+ this.wordNums = wordNums;
+ }
+
+ @Override
+ public boolean isCacheable() {
+ return true; // fully materialized in memory, cheap to reuse
+ }
+
+ @Override
+ public Iterator iterator() {
+ // each iterator carries its own read position over the shared immutable data
+ return new Iterator(data, positions, wordNums);
+ }
+
+ /** Decode a length from its 4-bit token {@code len}: bit 3 is a continuation
+  *  flag; when set, the remaining high bits follow as a vInt on {@code in}. */
+ static int readLength(ByteArrayDataInput in, int len) {
+   if ((len & 0x08) != 0) {
+     // continuation bit set: combine the low 3 bits with the vInt that follows
+     return (len & 0x07) | (in.readVInt() << 3);
+   }
+   return len;
+ }
+
+ /** Iterator over the compressed set. Decodes one sequence at a time (a run
+  *  of clean words followed by a run of dirty words); each word covers 8 docs. */
+ static class Iterator extends DocIdSetIterator {
+
+ final ByteArrayDataInput in;
+ final PackedInts.Reader positions, wordNums;
+ int dirtyLength; // number of dirty bytes remaining in the current sequence
+
+ int wordNum; // number of the current word (one word = 8 doc IDs)
+ byte word; // current word
+ int bitList; // list of bits set in the current word
+
+ int docID;
+
+ Iterator(byte[] data, PackedInts.Reader positions, PackedInts.Reader wordNums) {
+ this.in = new ByteArrayDataInput(data);
+ this.positions = positions;
+ this.wordNums = wordNums;
+ wordNum = -1;
+ word = 0;
+ bitList = 0;
+ docID = -1;
+ }
+
+ /** Decode the header of the next sequence. Returns false (and pins wordNum
+  *  to Integer.MAX_VALUE) once the input is exhausted. */
+ boolean readSequence() {
+ if (in.eof()) {
+ wordNum = Integer.MAX_VALUE;
+ return false;
+ }
+ final int token = in.readByte() & 0xFF;
+ // the very first sequence stores its clean length without the usual +2 bias
+ final int cleanLength = (in.getPosition() == 1 ? 0 : 2) + readLength(in, token >>> 4);
+ wordNum += cleanLength;
+ dirtyLength = 1 + readLength(in, token & 0x0F);
+ return true;
+ }
+
+ /** Skip {@code count} dirty bytes of the current sequence. */
+ void skipDirtyBytes(int count) {
+ assert count >= 0;
+ assert count <= dirtyLength;
+ in.skipBytes(count);
+ wordNum += count;
+ dirtyLength -= count;
+ }
+
+ /** Skip all remaining dirty bytes of the current sequence. */
+ void skipDirtyBytes() {
+ in.skipBytes(dirtyLength);
+ wordNum += dirtyLength;
+ dirtyLength = 0;
+ }
+
+ /** Advance to the next non-null word, reading a new sequence if needed. */
+ void nextWord() {
+ if (dirtyLength == 0 && !readSequence()) {
+ return; // exhausted: wordNum is now Integer.MAX_VALUE
+ }
+ word = in.readByte();
+ if (word == 0) {
+ // a null dirty byte encodes a single-word gap: skip over it
+ word = in.readByte();
+ assert word != 0; // there can never be two consecutive null dirty words
+ ++wordNum;
+ --dirtyLength;
+ }
+ ++wordNum;
+ --dirtyLength;
+ }
+
+ /** Return the greatest index i such that wordNums[i] <= targetWordNum. */
+ int binarySearch(int targetWordNum) {
+ int lo = 0, hi = positions.size() - 1;
+ while (lo <= hi) {
+ final int mid = (lo + hi) >>> 1;
+ final int midWordNum = (int) wordNums.get(mid);
+ if (midWordNum <= targetWordNum) {
+ lo = mid + 1;
+ } else {
+ hi = mid - 1;
+ }
+ }
+ assert wordNums.get(hi) <= targetWordNum;
+ assert hi+1 == wordNums.size() || wordNums.get(hi + 1) > targetWordNum;
+ return hi;
+ }
+
+ /** Position the iterator on the first word whose number is >= targetWordNum,
+  *  using the index to jump over whole sequences when the gap is large. */
+ void advanceWord(int targetWordNum) {
+ assert targetWordNum > wordNum;
+ if (dirtyLength == 0 && !readSequence()) {
+ return;
+ }
+ int delta = targetWordNum - wordNum;
+ if (delta <= dirtyLength + 1) {
+ // the target lies within (or right after) the current sequence
+ if (delta > 1) {
+ skipDirtyBytes(delta - 1);
+ }
+ } else {
+ skipDirtyBytes();
+ assert dirtyLength == 0;
+ // use the index
+ final int i = binarySearch(targetWordNum);
+ final int position = (int) positions.get(i);
+ if (position > in.getPosition()) { // if the binary search returned a backward offset, don't move
+ wordNum = (int) wordNums.get(i) - 1;
+ in.setPosition(position);
+ }
+
+ // linear scan from the checkpoint to the sequence containing the target
+ while (true) {
+ if (!readSequence()) {
+ return;
+ }
+ delta = targetWordNum - wordNum;
+ if (delta <= dirtyLength + 1) {
+ if (delta > 1) {
+ skipDirtyBytes(delta - 1);
+ }
+ break;
+ }
+ skipDirtyBytes();
+ }
+ }
+
+ nextWord();
+ }
+
+ @Override
+ public int docID() {
+ return docID;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ if (bitList != 0) { // there are remaining bits in the current word
+ // bitList packs 1-based bit positions into nibbles, low nibble consumed first
+ docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
+ bitList >>>= 4;
+ return docID;
+ }
+ nextWord();
+ if (wordNum == Integer.MAX_VALUE) {
+ return docID = NO_MORE_DOCS;
+ }
+ bitList = BitUtil.bitList(word);
+ assert bitList != 0;
+ docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
+ bitList >>>= 4;
+ return docID;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ assert target > docID;
+ final int targetWordNum = wordNum(target);
+ if (targetWordNum > wordNum) {
+ advanceWord(targetWordNum);
+ // NOTE(review): if advanceWord exhausted the input, 'word' is stale here,
+ // so bitList may briefly describe the previous word; slowAdvance still
+ // terminates at NO_MORE_DOCS via nextDoc -- confirm this is intentional
+ bitList = BitUtil.bitList(word);
+ }
+ return slowAdvance(target);
+ }
+
+ @Override
+ public long cost() {
+ return in.length(); // good estimation of the cost of iterating over all docs
+ }
+
+ }
+
+ /** Return the number of documents in this {@link DocIdSet}. This method
+  * runs in linear time but is much faster than counting documents. */
+ public int cardinality() {
+   int count = 0;
+   final Iterator it = iterator();
+   // a fresh iterator starts with word == 0, which contributes nothing
+   while (it.wordNum != Integer.MAX_VALUE) {
+     count += BitUtil.bitCount(it.word);
+     it.nextWord();
+   }
+   return count;
+ }
+
+ /** Return the memory usage of this class in bytes. */
+ public long ramBytesUsed() {
+ return RamUsageEstimator.alignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF)
+ + RamUsageEstimator.sizeOf(data)
+ + positions.ramBytesUsed()
+ + wordNums.ramBytesUsed();
+ }
+
+}
Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java?rev=1501925&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestWAH8DocIdSet.java Wed Jul 10 19:24:24 2013
@@ -0,0 +1,144 @@
+package org.apache.lucene.util;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+
+/** Tests {@link WAH8DocIdSet} against {@link FixedBitSet} as the reference
+ * implementation, including union and intersection. The iteration order of
+ * random() calls is significant for reproducibility; do not reorder. */
+public class TestWAH8DocIdSet extends LuceneTestCase {
+
+ // Build a FixedBitSet of numBits bits with exactly numBitsSet random bits set.
+ private static FixedBitSet randomSet(int numBits, int numBitsSet) {
+ assert numBitsSet <= numBits;
+ final FixedBitSet set = new FixedBitSet(numBits);
+ if (numBitsSet == numBits) {
+ set.set(0, set.length());
+ } else {
+ for (int i = 0; i < numBitsSet; ++i) {
+ // rejection-sample until an unset bit is found
+ while (true) {
+ final int o = random().nextInt(numBits);
+ if (!set.get(o)) {
+ set.set(o);
+ break;
+ }
+ }
+ }
+ }
+ return set;
+ }
+
+ // Convenience overload: density given as a fraction of numBits.
+ private static FixedBitSet randomSet(int numBits, float percentSet) {
+ return randomSet(numBits, (int) (percentSet * numBits));
+ }
+
+ public void testAgainstFixedBitSet() throws IOException {
+ final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
+ // cover empty, very sparse, random, dense and full sets
+ for (float percentSet : new float[] {0f, 0.0001f, random().nextFloat() / 2, 0.9f, 1f}) {
+ final FixedBitSet set = randomSet(numBits, percentSet);
+ final WAH8DocIdSet copy = WAH8DocIdSet.copyOf(set.iterator());
+ assertEquals(numBits, set, copy);
+ }
+ }
+
+ // Assert that ds2 iterates exactly the same docs as ds1, via nextDoc alone
+ // and via a random mix of nextDoc and advance.
+ public void assertEquals(int numBits, FixedBitSet ds1, WAH8DocIdSet ds2) throws IOException {
+ assertEquals(ds1.cardinality(), ds2.cardinality());
+
+ // nextDoc
+ DocIdSetIterator it1 = ds1.iterator();
+ DocIdSetIterator it2 = ds2.iterator();
+ assertEquals(it1.docID(), it2.docID());
+ for (int doc = it1.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it1.nextDoc()) {
+ assertEquals(doc, it2.nextDoc());
+ assertEquals(it1.docID(), it2.docID());
+ }
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, it2.nextDoc());
+ assertEquals(it1.docID(), it2.docID());
+
+ // nextDoc / advance
+ it1 = ds1.iterator();
+ it2 = ds2.iterator();
+ for (int doc = -1; doc != DocIdSetIterator.NO_MORE_DOCS;) {
+ if (random().nextBoolean()) {
+ doc = it1.nextDoc();
+ assertEquals(doc, it2.nextDoc());
+ assertEquals(it1.docID(), it2.docID());
+ } else {
+ // advance by a small or potentially large random step
+ final int target = doc + 1 + random().nextInt(random().nextBoolean() ? 64 : numBits / 64);
+ doc = it1.advance(target);
+ assertEquals(doc, it2.advance(target));
+ assertEquals(it1.docID(), it2.docID());
+ }
+ }
+ }
+
+ public void testUnion() throws IOException {
+ final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
+ final int numDocIdSets = _TestUtil.nextInt(random(), 0, 4);
+ final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
+ for (int i = 0; i < numDocIdSets; ++i) {
+ fixedSets.add(randomSet(numBits, random().nextFloat() / 16));
+ }
+ final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
+ for (FixedBitSet set : fixedSets) {
+ compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
+ }
+
+ final WAH8DocIdSet union = WAH8DocIdSet.union(compressedSets);
+ // reference union: OR every fixed set into one bit set
+ final FixedBitSet expected = new FixedBitSet(numBits);
+ for (DocIdSet set : fixedSets) {
+ final DocIdSetIterator it = set.iterator();
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ expected.set(doc);
+ }
+ }
+ assertEquals(numBits, expected, union);
+ }
+
+ public void testIntersection() throws IOException {
+ final int numBits = _TestUtil.nextInt(random(), 100, 1 << 20);
+ final int numDocIdSets = _TestUtil.nextInt(random(), 1, 4);
+ final List<FixedBitSet> fixedSets = new ArrayList<FixedBitSet>(numDocIdSets);
+ for (int i = 0; i < numDocIdSets; ++i) {
+ fixedSets.add(randomSet(numBits, random().nextFloat()));
+ }
+ final List<WAH8DocIdSet> compressedSets = new ArrayList<WAH8DocIdSet>(numDocIdSets);
+ for (FixedBitSet set : fixedSets) {
+ compressedSets.add(WAH8DocIdSet.copyOf(set.iterator()));
+ }
+
+ final WAH8DocIdSet union = WAH8DocIdSet.intersect(compressedSets);
+ // reference intersection: start from a full bit set and clear every gap of
+ // every input set
+ final FixedBitSet expected = new FixedBitSet(numBits);
+ expected.set(0, expected.length());
+ for (DocIdSet set : fixedSets) {
+ final DocIdSetIterator it = set.iterator();
+ int lastDoc = -1;
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ expected.clear(lastDoc + 1, doc);
+ lastDoc = doc;
+ }
+ if (lastDoc + 1 < expected.length()) {
+ expected.clear(lastDoc + 1, expected.length());
+ }
+ }
+ assertEquals(numBits, expected, union);
+ }
+
+}