You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/10/11 07:32:03 UTC
lucene-solr:master: LUCENE-7485: Better storage of sparse values in
Lucene70NormsFormat.
Repository: lucene-solr
Updated Branches:
refs/heads/master 63ef45902 -> 2fbbcac58
LUCENE-7485: Better storage of sparse values in Lucene70NormsFormat.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/2fbbcac5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/2fbbcac5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/2fbbcac5
Branch: refs/heads/master
Commit: 2fbbcac580fbfd866a8571c86e9e38704d58cf9d
Parents: 63ef459
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Oct 11 09:20:34 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Oct 11 09:20:34 2016 +0200
----------------------------------------------------------------------
.../lucene/codecs/lucene70/IndexedDISI.java | 269 +++++++++++++++++++
.../codecs/lucene70/Lucene70NormsConsumer.java | 8 +-
.../codecs/lucene70/Lucene70NormsFormat.java | 4 +-
.../codecs/lucene70/Lucene70NormsProducer.java | 4 +-
.../lucene/codecs/lucene70/SparseDISI.java | 114 --------
.../lucene/codecs/lucene70/TestIndexedDISI.java | 223 +++++++++++++++
.../lucene/codecs/lucene70/TestSparseDISI.java | 94 -------
7 files changed, 504 insertions(+), 212 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
new file mode 100644
index 0000000..3ea3141
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RoaringDocIdSet;
+
+/**
+ * Disk-based implementation of a {@link DocIdSetIterator} which can return
+ * the index of the current document, i.e. the ordinal of the current document
+ * among the list of documents that this iterator can return. This is useful
+ * to implement sparse doc values by only having to encode values for documents
+ * that actually have a value.
+ * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of
+ * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
+ * documents independently and picks between 3 encodings depending on the
+ * density of the range:<ul>
+ * <li>{@code ALL} if the range contains 65536 documents exactly,
+ * <li>{@code DENSE} if the range contains 4096 documents or more; in that
+ * case documents are stored in a bit set,
+ * <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
+ * stored in a {@link DataInput#readShort() short}.
+ * </ul>
+ * <p>Only ranges that contain at least one value are encoded.
+ * <p>This implementation uses 6 bytes per document in the worst-case, which happens
+ * in the case that all ranges contain exactly one document.
+ * @lucene.internal
+ */
+final class IndexedDISI extends DocIdSetIterator {
+
+ static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
+
+ private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out) throws IOException {
+ assert block >= 0 && block < 65536;
+ out.writeShort((short) block);
+ assert cardinality > 0 && cardinality <= 65536;
+ out.writeShort((short) (cardinality - 1));
+ if (cardinality > MAX_ARRAY_LENGTH) {
+ if (cardinality != 65536) { // all docs are set
+ for (long word : buffer.getBits()) {
+ out.writeLong(word);
+ }
+ }
+ } else {
+ BitSetIterator it = new BitSetIterator(buffer, cardinality);
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ out.writeShort((short) doc);
+ }
+ }
+ }
+
+ static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
+ int i = 0;
+ final FixedBitSet buffer = new FixedBitSet(1<<16);
+ int prevBlock = -1;
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ final int block = doc >>> 16;
+ if (prevBlock != -1 && block != prevBlock) {
+ flush(prevBlock, buffer, i, out);
+ buffer.clear(0, buffer.length());
+ prevBlock = block;
+ i = 0;
+ }
+ buffer.set(doc & 0xFFFF);
+ i++;
+ prevBlock = block;
+ }
+ if (i > 0) {
+ flush(prevBlock, buffer, i, out);
+ buffer.clear(0, buffer.length());
+ }
+ // NO_MORE_DOCS is stored explicitly
+ buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
+ flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
+ }
+
+ /** The slice that stores the {@link DocIdSetIterator}. */
+ private final IndexInput slice;
+ private final long cost;
+
+ IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
+ this.slice = in.slice("docs", offset, length);
+ this.cost = cost;
+ }
+
+ private int block = -1;
+ private long blockEnd;
+ private int nextBlockIndex = -1;
+ Method method;
+
+ private int doc = -1;
+ private int index = -1;
+
+ // DENSE variables
+ private long word;
+ private int wordIndex = -1;
+ // number of one bits encountered so far, including those of `word`
+ private int numberOfOnes;
+
+ // ALL variables
+ private int gap;
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ final int targetBlock = target & 0xFFFF0000;
+ if (block != targetBlock) {
+ advanceBlock(targetBlock);
+ }
+ if (block == targetBlock) {
+ if (method.advanceWithinBlock(this, target)) {
+ return doc;
+ }
+ readBlockHeader();
+ }
+ return doc = method.readFirstDoc(this);
+ }
+
+ private void advanceBlock(int targetBlock) throws IOException {
+ do {
+ slice.seek(blockEnd);
+ readBlockHeader();
+ } while (block < targetBlock);
+ }
+
+ private void readBlockHeader() throws IOException {
+ block = Short.toUnsignedInt(slice.readShort()) << 16;
+ assert block >= 0;
+ final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
+ index = nextBlockIndex;
+ nextBlockIndex = index + numValues;
+ if (numValues <= MAX_ARRAY_LENGTH) {
+ method = Method.SPARSE;
+ blockEnd = slice.getFilePointer() + (numValues << 1);
+ } else if (numValues == 65536) {
+ method = Method.ALL;
+ blockEnd = slice.getFilePointer();
+ gap = block - index - 1;
+ } else {
+ method = Method.DENSE;
+ blockEnd = slice.getFilePointer() + (1 << 13);
+ wordIndex = -1;
+ numberOfOnes = index + 1;
+ }
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ public int index() {
+ return index;
+ }
+
+ @Override
+ public long cost() {
+ return cost;
+ }
+
+ enum Method {
+ SPARSE {
+ @Override
+ int readFirstDoc(IndexedDISI disi) throws IOException {
+ disi.index++;
+ return disi.block | Short.toUnsignedInt(disi.slice.readShort());
+ }
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ // TODO: binary search
+ for (; disi.index < disi.nextBlockIndex;) {
+ int doc = Short.toUnsignedInt(disi.slice.readShort());
+ disi.index++;
+ if (doc >= targetInBlock) {
+ disi.doc = disi.block | doc;
+ return true;
+ }
+ }
+ return false;
+ }
+ },
+ DENSE {
+ @Override
+ int readFirstDoc(IndexedDISI disi) throws IOException {
+ do {
+ disi.word = disi.slice.readLong();
+ disi.wordIndex++;
+ } while (disi.word == 0L);
+ disi.index = disi.numberOfOnes;
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ return disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
+ }
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ final int targetWordIndex = targetInBlock >>> 6;
+ for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
+ disi.word = disi.slice.readLong();
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ }
+ disi.wordIndex = targetWordIndex;
+
+ long leftBits = disi.word >>> target;
+ if (leftBits != 0L) {
+ disi.doc = target + Long.numberOfTrailingZeros(leftBits);
+ disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
+ return true;
+ }
+
+ while (++disi.wordIndex < 1024) {
+ disi.word = disi.slice.readLong();
+ if (disi.word != 0) {
+ disi.index = disi.numberOfOnes;
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
+ return true;
+ }
+ }
+ return false;
+ }
+ },
+ ALL {
+ @Override
+ int readFirstDoc(IndexedDISI disi) {
+ return disi.block;
+ }
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+ disi.doc = target;
+ disi.index = target - disi.gap;
+ return true;
+ }
+ };
+
+ /** Read the first document of the current block. */
+ abstract int readFirstDoc(IndexedDISI disi) throws IOException;
+
+ /** Advance to the first doc from the block that is equal to or greater than {@code target}.
+ * Return true if there is such a doc and false otherwise. */
+ abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
index 00cd5ec..d79e246 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
@@ -96,12 +96,16 @@ final class Lucene70NormsConsumer extends NormsConsumer {
if (numDocsWithValue == 0) {
meta.writeLong(-2);
+ meta.writeLong(0L);
} else if (numDocsWithValue == maxDoc) {
meta.writeLong(-1);
+ meta.writeLong(0L);
} else {
- meta.writeLong(data.getFilePointer());
+ long offset = data.getFilePointer();
+ meta.writeLong(offset);
values = normsProducer.getNorms(field);
- SparseDISI.writeBitSet(values, maxDoc, data);
+ IndexedDISI.writeBitSet(values, data);
+ meta.writeLong(data.getFilePointer() - offset);
}
meta.writeInt(numDocsWithValue);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
index 7e70b24..7c764fe 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
@@ -45,9 +45,10 @@ import org.apache.lucene.store.DataOutput;
* <p>Norms metadata (.dvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- * <li>Entry --> FieldNumber, DocsWithFieldAddress, NumDocsWithField, BytesPerNorm, NormsAddress</li>
+ * <li>Entry --> FieldNumber, DocsWithFieldAddress, DocsWithFieldLength, NumDocsWithField, BytesPerNorm, NormsAddress</li>
* <li>FieldNumber --> {@link DataOutput#writeInt Int32}</li>
* <li>DocsWithFieldAddress --> {@link DataOutput#writeLong Int64}</li>
+ * <li>DocsWithFieldLength --> {@link DataOutput#writeLong Int64}</li>
* <li>NumDocsWithField --> {@link DataOutput#writeInt Int32}</li>
* <li>BytesPerNorm --> {@link DataOutput#writeByte byte}</li>
* <li>NormsAddress --> {@link DataOutput#writeLong Int64}</li>
@@ -60,6 +61,7 @@ import org.apache.lucene.store.DataOutput;
* <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
* in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
* value.</p>
+ * <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
* <li><a name="nvd"></a>
* <p>The Norms data or .nvd file.</p>
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
index 79c185c..e3f6f79 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
@@ -90,6 +90,7 @@ final class Lucene70NormsProducer extends NormsProducer {
static class NormsEntry {
byte bytesPerNorm;
long docsWithFieldOffset;
+ long docsWithFieldLength;
int numDocsWithField;
long normsOffset;
}
@@ -108,6 +109,7 @@ final class Lucene70NormsProducer extends NormsProducer {
}
NormsEntry entry = new NormsEntry();
entry.docsWithFieldOffset = meta.readLong();
+ entry.docsWithFieldLength = meta.readLong();
entry.numDocsWithField = meta.readInt();
entry.bytesPerNorm = meta.readByte();
switch (entry.bytesPerNorm) {
@@ -166,7 +168,7 @@ final class Lucene70NormsProducer extends NormsProducer {
} else {
// sparse
final LongValues normValues = getNormValues(entry);
- final SparseDISI disi = new SparseDISI(maxDoc, data, entry.docsWithFieldOffset, entry.numDocsWithField);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
return new NumericDocValues() {
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
deleted file mode 100644
index b924297..0000000
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/SparseDISI.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene70;
-
-import java.io.IOException;
-
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-
-final class SparseDISI extends DocIdSetIterator {
-
- static void writeBitSet(DocIdSetIterator it, int maxDoc, IndexOutput out) throws IOException {
- int currentIndex = 0;
- long currentBits = 0;
- for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
- final int index = doc >>> 6;
- if (index > currentIndex) {
- out.writeLong(currentBits);
- for (int i = currentIndex + 1; i < index; ++i) {
- out.writeLong(0L);
- }
- currentIndex = index;
- currentBits = 0L;
- }
- currentBits |= 1L << doc;
- }
-
- out.writeLong(currentBits);
- final int maxIndex = (maxDoc - 1) >>> 6;
- for (int i = currentIndex + 1; i <= maxIndex; ++i) {
- out.writeLong(0L);
- }
- }
-
- final int maxDoc;
- final int numWords;
- final long cost;
- final IndexInput slice;
- int doc = -1;
- int wordIndex = -1;
- long word;
- int index = -1;
-
- SparseDISI(int maxDoc, IndexInput in, long offset, long cost) throws IOException {
- this.maxDoc = maxDoc;
- this.numWords = (int) ((maxDoc + 63L) >>> 6);
- this.slice = in.slice("docs", offset, numWords * 8L);
- this.cost = cost;
- }
-
- @Override
- public int advance(int target) throws IOException {
- if (target >= maxDoc) {
- return doc = NO_MORE_DOCS;
- }
-
- final int targetWordIndex = target >>> 6;
- for (int i = wordIndex + 1; i <= targetWordIndex; ++i) {
- word = slice.readLong();
- index += Long.bitCount(word);
- }
- wordIndex = targetWordIndex;
-
- long leftBits = word >>> target;
- if (leftBits != 0L) {
- return doc = target + Long.numberOfTrailingZeros(leftBits);
- }
-
- while (++wordIndex < numWords) {
- word = slice.readLong();
- if (word != 0) {
- index += Long.bitCount(word);
- return doc = (wordIndex << 6) + Long.numberOfTrailingZeros(word);
- }
- }
-
- return doc = NO_MORE_DOCS;
- }
-
- @Override
- public int nextDoc() throws IOException {
- return advance(doc + 1);
- }
-
- @Override
- public int docID() {
- return doc;
- }
-
- @Override
- public long cost() {
- return cost;
- }
-
- public int index() {
- return index - Long.bitCount(word >>> doc) + 1;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
new file mode 100644
index 0000000..18b4590
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene70;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestIndexedDISI extends LuceneTestCase {
+
+ public void testEmpty() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testOneDoc() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(random().nextInt(maxDoc));
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testTwoDocs() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(random().nextInt(maxDoc));
+ set.set(random().nextInt(maxDoc));
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testAllDocs() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(1, maxDoc);
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testHalfFull() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
+ set.set(i);
+ }
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testDocRange() throws IOException {
+ try (Directory dir = newDirectory()) {
+ for (int iter = 0; iter < 10; ++iter) {
+ int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ final int start = random().nextInt(maxDoc);
+ final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
+ set.set(start, end);
+ doTest(set, dir);
+ }
+ }
+ }
+
+ public void testSparseDenseBoundary() throws IOException {
+ try (Directory dir = newDirectory()) {
+ FixedBitSet set = new FixedBitSet(200000);
+ int start = 65536 + random().nextInt(100);
+
+ // we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
+ set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
+ long length;
+ try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
+ IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out);
+ length = out.getFilePointer();
+ }
+ try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
+ IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH);
+ assertEquals(start, disi.nextDoc());
+ assertEquals(IndexedDISI.Method.SPARSE, disi.method);
+ }
+ doTest(set, dir);
+
+ // now we set one more bit so the encoding will be dense
+ set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
+ try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
+ IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out);
+ length = out.getFilePointer();
+ }
+ try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
+ IndexedDISI disi = new IndexedDISI(in, 0L, length, IndexedDISI.MAX_ARRAY_LENGTH + 1);
+ assertEquals(start, disi.nextDoc());
+ assertEquals(IndexedDISI.Method.DENSE, disi.method);
+ }
+ doTest(set, dir);
+ }
+ }
+
+ public void testOneDocMissing() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(0, maxDoc);
+ set.clear(random().nextInt(maxDoc));
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ public void testFewMissingDocs() throws IOException {
+ try (Directory dir = newDirectory()) {
+ for (int iter = 0; iter < 100; ++iter) {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(0, maxDoc);
+ final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
+ for (int i = 0; i < numMissingDocs; ++i) {
+ set.clear(random().nextInt(maxDoc));
+ }
+ doTest(set, dir);
+ }
+ }
+ }
+
+ public void testRandom() throws IOException {
+ try (Directory dir = newDirectory()) {
+ for (int i = 0; i < 100; ++i) {
+ doTestRandom(dir);
+ }
+ }
+ }
+
+ private void doTestRandom(Directory dir) throws IOException {
+ List<Integer> docs = new ArrayList<>();
+ final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 20));
+ final int numDocs = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / maxStep));
+ for (int doc = -1, i = 0; i < numDocs; ++i) {
+ doc += TestUtil.nextInt(random(), 1, maxStep);
+ docs.add(doc);
+ }
+ final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
+
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ for (int doc : docs) {
+ set.set(doc);
+ }
+
+ doTest(set, dir);
+ }
+
+ private void doTest(FixedBitSet set, Directory dir) throws IOException {
+ final int cardinality = set.cardinality();
+ long length;
+ try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+ IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out);
+ length = out.getFilePointer();
+ }
+
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
+ BitSetIterator disi2 = new BitSetIterator(set, cardinality);
+ int i = 0;
+ for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
+ assertEquals(doc, disi.nextDoc());
+ assertEquals(i++, disi.index());
+ }
+ assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
+ }
+
+ for (int step : new int[] {1, 10, 100, 1000, 10000, 100000}) {
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ IndexedDISI disi = new IndexedDISI(in, 0L, length, cardinality);
+ BitSetIterator disi2 = new BitSetIterator(set, cardinality);
+ int index = -1;
+ while (true) {
+ int target = disi2.docID() + step;
+ int doc;
+ do {
+ doc = disi2.nextDoc();
+ index++;
+ } while (doc < target);
+ assertEquals(doc, disi.advance(target));
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ break;
+ }
+ assertEquals(index, disi.index());
+ }
+ }
+ }
+
+ dir.deleteFile("foo");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/2fbbcac5/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
deleted file mode 100644
index 1911bd0..0000000
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestSparseDISI.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.codecs.lucene70;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.BitSetIterator;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.TestUtil;
-
-public class TestSparseDISI extends LuceneTestCase {
-
- public void testRandom() throws IOException {
- try (Directory dir = newDirectory()) {
- for (int i = 0; i < 1000; ++i) {
- doTestRandom(dir);
- }
- }
- }
-
- private void doTestRandom(Directory dir) throws IOException {
- List<Integer> docs = new ArrayList<>();
- final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 10));
- final int numDocs = TestUtil.nextInt(random(), 1, 1000);
- for (int doc = -1, i = 0; i < numDocs; ++i) {
- doc += TestUtil.nextInt(random(), 1, maxStep);
- docs.add(doc);
- }
- final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
-
- FixedBitSet set = new FixedBitSet(maxDoc);
- for (int doc : docs) {
- set.set(doc);
- }
-
- try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
- SparseDISI.writeBitSet(new BitSetIterator(set, docs.size()), maxDoc, out);
- }
-
- try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
- SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
- BitSetIterator disi2 = new BitSetIterator(set, docs.size());
- int i = 0;
- for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
- assertEquals(doc, disi.nextDoc());
- assertEquals(i++, disi.index());
- }
- assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
- }
-
- for (int step : new int[] {1, 20, maxStep, maxStep * 10}) {
- try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
- SparseDISI disi = new SparseDISI(maxDoc, in, 0L, docs.size());
- BitSetIterator disi2 = new BitSetIterator(set, docs.size());
- while (true) {
- int target = disi2.docID() + step;
- int doc = disi2.advance(target);
- assertEquals(doc, disi.advance(target));
- if (doc == DocIdSetIterator.NO_MORE_DOCS) {
- break;
- }
- int index = Collections.binarySearch(docs, doc);
- assertEquals(index, disi.index());
- }
- }
- }
-
- dir.deleteFile("foo");
- }
-
-}