You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2019/01/18 21:51:06 UTC
[lucene-solr] branch master updated: LUCENE-8585: Create jump-tables for DocValues at index-time
This is an automated email from the ASF dual-hosted git repository.
toke pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new c13645b LUCENE-8585: Create jump-tables for DocValues at index-time
c13645b is described below
commit c13645bd4c65f052e7f35df8e33d3dbfe6d7cd58
Author: Toke Eskildsen <to...@apache.org>
AuthorDate: Fri Jan 18 22:42:04 2019 +0100
LUCENE-8585: Create jump-tables for DocValues at index-time
---
lucene/CHANGES.txt | 3 +
.../apache/lucene/codecs/lucene70/IndexedDISI.java | 0
.../codecs/lucene70/Lucene70DocValuesConsumer.java | 0
.../codecs/lucene70/Lucene70DocValuesFormat.java | 0
.../codecs/lucene70/Lucene70DocValuesProducer.java | 0
.../codecs/lucene70/Lucene70NormsConsumer.java | 0
.../codecs/lucene70/Lucene70NormsFormat.java | 0
.../codecs/lucene70/Lucene70NormsProducer.java | 0
.../org.apache.lucene.codecs.DocValuesFormat | 1 +
.../lucene/codecs/lucene70/TestIndexedDISI.java | 0
.../lucene70/TestLucene70DocValuesFormat.java | 0
.../codecs/lucene70/TestLucene70NormsFormat.java | 0
.../lucene/codecs/lucene50/package-info.java | 2 +-
.../lucene/codecs/lucene60/package-info.java | 2 +-
.../lucene/codecs/lucene70/package-info.java | 387 +------------
.../apache/lucene/codecs/lucene80/IndexedDISI.java | 632 +++++++++++++++++++++
.../lucene/codecs/lucene80/Lucene80Codec.java | 7 +-
.../Lucene80DocValuesConsumer.java} | 247 ++++----
.../Lucene80DocValuesFormat.java} | 34 +-
.../Lucene80DocValuesProducer.java} | 216 +++----
.../Lucene80NormsConsumer.java} | 34 +-
.../Lucene80NormsFormat.java} | 18 +-
.../Lucene80NormsProducer.java} | 49 +-
.../lucene/codecs/lucene80/package-info.java | 12 +-
.../org.apache.lucene.codecs.DocValuesFormat | 2 +-
.../lucene/codecs/lucene80/TestIndexedDISI.java | 522 +++++++++++++++++
.../TestLucene80DocValuesFormat.java} | 178 ++++--
.../TestLucene80NormsFormat.java} | 7 +-
.../org/apache/lucene/index/TestDocValues.java | 6 +-
.../lucene/index/BaseDocValuesFormatTestCase.java | 19 +-
.../src/java/org/apache/lucene/util/TestUtil.java | 4 +-
.../solr/collection1/conf/schema_codec.xml | 2 +-
32 files changed, 1659 insertions(+), 725 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fd2e8d0..b6a600d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -243,6 +243,9 @@ Optimizations
* LUCENE-8607: MatchAllDocsQuery can shortcut when total hit count is not
required (Alan Woodward, Adrien Grand)
+* LUCENE-8585: Index-time jump-tables for DocValues, for O(1) advance when retrieving doc values.
+ (Toke Eskildsen, Adrien Grand)
+
======================= Lucene 7.7.0 =======================
Changes in Runtime Behavior
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
similarity index 100%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/IndexedDISI.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
similarity index 100%
copy from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
copy to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index 4a812de..20463c5 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -13,3 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
similarity index 100%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
rename to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestIndexedDISI.java
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
similarity index 100%
copy from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
copy to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
similarity index 100%
copy from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
copy to lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
index 9170c69..e63873a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
@@ -17,7 +17,7 @@
/**
* Components from the Lucene 5.0 index format
- * See {@link org.apache.lucene.codecs.lucene50} for an overview
+ * See {@link org.apache.lucene.codecs.lucene80} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene50;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
index a914001..b7145cc 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene80}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
index 08dd72d..e1913a0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene70/package-info.java
@@ -16,390 +16,7 @@
*/
/**
- * Lucene 7.0 file format.
- *
- * <h1>Apache Lucene - Index File Formats</h1>
- * <div>
- * <ul>
- * <li><a href="#Introduction">Introduction</a></li>
- * <li><a href="#Definitions">Definitions</a>
- * <ul>
- * <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
- * <li><a href="#Types_of_Fields">Types of Fields</a></li>
- * <li><a href="#Segments">Segments</a></li>
- * <li><a href="#Document_Numbers">Document Numbers</a></li>
- * </ul>
- * </li>
- * <li><a href="#Overview">Index Structure Overview</a></li>
- * <li><a href="#File_Naming">File Naming</a></li>
- * <li><a href="#file-names">Summary of File Extensions</a>
- * <ul>
- * <li><a href="#Lock_File">Lock File</a></li>
- * <li><a href="#History">History</a></li>
- * <li><a href="#Limitations">Limitations</a></li>
- * </ul>
- * </li>
- * </ul>
- * </div>
- * <a name="Introduction"></a>
- * <h2>Introduction</h2>
- * <div>
- * <p>This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * <code>docs/</code> that was distributed with
- * the version you are using.</p>
- * <p>This document attempts to provide a high-level definition of the Apache
- * Lucene file formats.</p>
- * </div>
- * <a name="Definitions"></a>
- * <h2>Definitions</h2>
- * <div>
- * <p>The fundamental concepts in Lucene are index, document, field and term.</p>
- * <p>An index contains a sequence of documents.</p>
- * <ul>
- * <li>A document is a sequence of fields.</li>
- * <li>A field is a named sequence of terms.</li>
- * <li>A term is a sequence of bytes.</li>
- * </ul>
- * <p>The same sequence of bytes in two different fields is considered a different
- * term. Thus terms are represented as a pair: the string naming the field, and the
- * bytes within the field.</p>
- * <a name="Inverted_Indexing"></a>
- * <h3>Inverted Indexing</h3>
- * <p>The index stores statistics about terms in order to make term-based search
- * more efficient. Lucene's index falls into the family of indexes known as an
- * <i>inverted index.</i> This is because it can list, for a term, the documents
- * that contain it. This is the inverse of the natural relationship, in which
- * documents list terms.</p>
- * <a name="Types_of_Fields"></a>
- * <h3>Types of Fields</h3>
- * <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
- * in the index literally, in a non-inverted manner. Fields that are inverted are
- * called <i>indexed</i>. A field may be both stored and indexed.</p>
- * <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
- * text of a field may be used literally as a term to be indexed. Most fields are
- * tokenized, but sometimes it is useful for certain identifier fields to be
- * indexed literally.</p>
- * <p>See the {@link org.apache.lucene.document.Field Field}
- * java docs for more information on Fields.</p>
- * <a name="Segments"></a>
- * <h3>Segments</h3>
- * <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
- * Each segment is a fully independent index, which could be searched separately.
- * Indexes evolve by:</p>
- * <ol>
- * <li>Creating new segments for newly added documents.</li>
- * <li>Merging existing segments.</li>
- * </ol>
- * <p>Searches may involve multiple segments and/or multiple indexes, each index
- * potentially composed of a set of segments.</p>
- * <a name="Document_Numbers"></a>
- * <h3>Document Numbers</h3>
- * <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
- * The first document added to an index is numbered zero, and each subsequent
- * document added gets a number one greater than the previous.</p>
- * <p>Note that a document's number may change, so caution should be taken when
- * storing these numbers outside of Lucene. In particular, numbers may change in
- * the following situations:</p>
- * <ul>
- * <li>
- * <p>The numbers stored in each segment are unique only within the segment, and
- * must be converted before they can be used in a larger context. The standard
- * technique is to allocate each segment a range of values, based on the range of
- * numbers used in that segment. To convert a document number from a segment to an
- * external value, the segment's <i>base</i> document number is added. To convert
- * an external value back to a segment-specific value, the segment is identified
- * by the range that the external value is in, and the segment's base value is
- * subtracted. For example two five document segments might be combined, so that
- * the first segment has a base value of zero, and the second of five. Document
- * three from the second segment would have an external value of eight.</p>
- * </li>
- * <li>
- * <p>When documents are deleted, gaps are created in the numbering. These are
- * eventually removed as the index evolves through merging. Deleted documents are
- * dropped when segments are merged. A freshly-merged segment thus has no gaps in
- * its numbering.</p>
- * </li>
- * </ul>
- * </div>
- * <a name="Overview"></a>
- * <h2>Index Structure Overview</h2>
- * <div>
- * <p>Each segment index maintains the following:</p>
- * <ul>
- * <li>
- * {@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment info}.
- * This contains metadata about a segment, such as the number of documents,
- * what files it uses,
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
- * This contains the set of field names used in the index.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
- * This contains, for each document, a list of attribute-value pairs, where the attributes
- * are field names. These are used to store auxiliary information about the document, such as
- * its title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}.
- * A dictionary containing all of the terms used in all of the
- * indexed fields of all of the documents. The dictionary also contains the number
- * of documents which contain the term, and pointers to the term's frequency and
- * proximity data.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}.
- * For each term in the dictionary, the numbers of all the
- * documents that contain that term, and the frequency of the term in that
- * document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}.
- * For each term in the dictionary, the positions that the
- * term occurs in each document. Note that this will not exist if all fields in
- * all documents omit position data.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}.
- * For each field in each document, a value is stored
- * that is multiplied into the score for hits on that field.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
- * For each field in each document, the term vector (sometimes
- * called document vector) may be stored. A term vector consists of term text and
- * term frequency. To add Term Vectors to your index see the
- * {@link org.apache.lucene.document.Field Field} constructors
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-document values}.
- * Like stored values, these are also keyed by document
- * number, but are generally intended to be loaded into main memory for fast
- * access. Whereas stored values are generally intended for summary results from
- * searches, per-document values are useful for things like scoring factors.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
- * An optional file indicating which documents are live.
- * </li>
- * <li>
- * {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}.
- * Optional pair of files, recording dimensionally indexed fields, to enable fast
- * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
- * and geographic shape intersection (2D, 3D).
- * </li>
- * </ul>
- * <p>Details on each of these are provided in their linked pages.</p>
- * </div>
- * <a name="File_Naming"></a>
- * <h2>File Naming</h2>
- * <div>
- * <p>All files belonging to a segment have the same name with varying extensions.
- * The extensions correspond to the different file formats described below. When
- * using the Compound File format (default for small segments) these files (except
- * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
- * into a single .cfs file (see below for details)</p>
- * <p>Typically, all segments in an index are stored in a single directory,
- * although this is not required.</p>
- * <p>File names are never re-used. That is, when any file is saved
- * to the Directory it is given a never before used filename. This is achieved
- * using a simple generations approach. For example, the first segments file is
- * segments_1, then segments_2, etc. The generation is a sequential long integer
- * represented in alpha-numeric (base 36) form.</p>
- * </div>
- * <a name="file-names"></a>
- * <h2>Summary of File Extensions</h2>
- * <div>
- * <p>The following table summarizes the names and extensions of the files in
- * Lucene:</p>
- * <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
- * <tr>
- * <th>Name</th>
- * <th>Extension</th>
- * <th>Brief Description</th>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
- * <td>segments_N</td>
- * <td>Stores information about a commit point</td>
- * </tr>
- * <tr>
- * <td><a href="#Lock_File">Lock File</a></td>
- * <td>write.lock</td>
- * <td>The Write lock prevents multiple IndexWriters from writing to the same
- * file.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment Info}</td>
- * <td>.si</td>
- * <td>Stores metadata about a segment</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
- * <td>.cfs, .cfe</td>
- * <td>An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
- * <td>.fnm</td>
- * <td>Stores information about the fields</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index}</td>
- * <td>.fdx</td>
- * <td>Contains pointers to field data</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data}</td>
- * <td>.fdt</td>
- * <td>The stored fields for documents</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
- * <td>.tim</td>
- * <td>The term dictionary, stores term info</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
- * <td>.tip</td>
- * <td>The index into the Term Dictionary</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
- * <td>.doc</td>
- * <td>Contains the list of docs which contain each term along with frequency</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
- * <td>.pos</td>
- * <td>Stores position information about where a term occurs in the index</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
- * <td>.pay</td>
- * <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}</td>
- * <td>.nvd, .nvm</td>
- * <td>Encodes length and boost factors for docs and fields</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}</td>
- * <td>.dvd, .dvm</td>
- * <td>Encodes additional scoring factors or other per-document information.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
- * <td>.tvx</td>
- * <td>Stores offset into the document data file</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
- * <td>.tvd</td>
- * <td>Contains term vector data.</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
- * <td>.liv</td>
- * <td>Info about what documents are live</td>
- * </tr>
- * <tr>
- * <td>{@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}</td>
- * <td>.dii, .dim</td>
- * <td>Holds indexed points, if any</td>
- * </tr>
- * </table>
- * </div>
- * <a name="Lock_File"></a>
- * <h2>Lock File</h2>
- * The write lock, which is stored in the index directory by default, is named
- * "write.lock". If the lock directory is different from the index directory then
- * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
- * derived from the full path to the index directory. When this file is present, a
- * writer is currently modifying the index (adding or removing documents). This
- * lock file ensures that only one writer is modifying the index at a time.
- * <a name="History"></a>
- * <h2>History</h2>
- * <p>Compatibility notes are provided in this document, describing how file
- * formats have changed from prior versions:</p>
- * <ul>
- * <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
- * no more commit lock). The change is fully backwards compatible: you can open a
- * pre-2.1 index for searching or adding/deleting of docs. When the new segments
- * file is saved (committed), it will be written in the new file format (meaning
- * no specific "upgrade" process is needed). But note that once a commit has
- * occurred, pre-2.1 Lucene will not be able to read the index.</li>
- * <li>In version 2.3, the file format was changed to allow segments to share a
- * single set of doc store (vectors & stored fields) files. This allows for
- * faster indexing in certain cases. The change is fully backwards compatible (in
- * the same way as the lock-less commits change in 2.1).</li>
- * <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
- * Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
- * LUCENE-510</a> for details.</li>
- * <li>In version 2.9, an optional opaque Map<String,String> CommitUserData
- * may be passed to IndexWriter's commit methods (and later retrieved), which is
- * recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
- * LUCENE-1382</a> for details. Also,
- * diagnostics were added to each segment written recording details about why it
- * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
- * <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
- * <li>In version 3.0, compressed fields are no longer written to the index (they
- * can still be read, but on merge the new segment will write them, uncompressed).
- * See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
- * for details.</li>
- * <li>In version 3.1, segments records the code version that created them. See
- * <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
- * Additionally segments track explicitly whether or not they have term vectors.
- * See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
- * for details.</li>
- * <li>In version 3.2, numeric fields are written as natively to stored fields
- * file, previously they were stored in text format only.</li>
- * <li>In version 3.4, fields can omit position data while still indexing term
- * frequencies.</li>
- * <li>In version 4.0, the format of the inverted index became extensible via
- * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
- * ({@code DocValues}) was introduced. Normalization factors need no longer be a
- * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
- * Terms need not be unicode strings, they can be any byte sequence. Term offsets
- * can optionally be indexed into the postings lists. Payloads can be stored in the
- * term vectors.</li>
- * <li>In version 4.1, the format of the postings list changed to use either
- * of FOR compression or variable-byte encoding, depending upon the frequency
- * of the term. Terms appearing only once were changed to inline directly into
- * the term dictionary. Stored fields are compressed by default. </li>
- * <li>In version 4.2, term vectors are compressed by default. DocValues has
- * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
- * on multi-valued fields.</li>
- * <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
- * <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.</li>
- * <li>In version 4.8, checksum footers were added to the end of each index file
- * for improved data integrity. Specifically, the last 8 bytes of every index file
- * contain the zlib-crc32 checksum of the file.</li>
- * <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
- * that is suitable for faceting/sorting/analytics.
- * <li>In version 5.4, DocValues have been improved to store more information on disk:
- * addresses for binary fields and ord indexes for multi-valued fields.
- * <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
- * <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
- * <li>In version 7.0, DocValues have been improved to better support sparse doc values
- * thanks to an iterator API.
- * </li>
- * </ul>
- * <a name="Limitations"></a>
- * <h2>Limitations</h2>
- * <div>
- * <p>Lucene uses a Java <code>int</code> to refer to
- * document numbers, and the index file format uses an <code>Int32</code>
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either <code>UInt64</code> values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
- * </div>
+ * Components from the Lucene 7.0 index format. See {@link org.apache.lucene.codecs.lucene80}
+ * for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene70;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java
new file mode 100644
index 0000000..8ddb93e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/IndexedDISI.java
@@ -0,0 +1,632 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RoaringDocIdSet;
+
+/**
+ * Disk-based implementation of a {@link DocIdSetIterator} which can return
+ * the index of the current document, i.e. the ordinal of the current document
+ * among the list of documents that this iterator can return. This is useful
+ * to implement sparse doc values by only having to encode values for documents
+ * that actually have a value.
+ * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired by
+ * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
+ * documents independently and picks between 3 encodings depending on the
+ * density of the range:<ul>
+ * <li>{@code ALL} if the range contains 65536 documents exactly,
+ * <li>{@code DENSE} if the range contains 4096 documents or more; in that
+ * case documents are stored in a bit set,
+ * <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
+ * stored in a {@link DataInput#readShort() short}.
+ * </ul>
+ * <p>Only ranges that contain at least one value are encoded.
+ * <p>This implementation uses 6 bytes per document in the worst-case, which happens
+ * in the case that all ranges contain exactly one document.
+ *
+ *
+ * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup
+ * tables are used: A lookup table for block offset and index, and a rank structure
+ * for DENSE block index lookups.
+ *
+ * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for
+ * direct jumping to the block, as opposed to iteration from the current position and forward
+ * one block at a time.
+ *
+ * Each int-pair entry consists of 2 logical parts:
+ *
+ * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the
+ * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31.
+ *
+ * The next int holds the offset in bytes into the underlying slice. As there is a maximum of 2^16
+ * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid
+ * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the
+ * largest block being DENSE and using 2^13 + 36 bytes.
+ *
+ * The cache overhead is numDocs/1024 bytes.
+ *
+ * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits).
+ * In the case of non-existing blocks, the entry in the lookup table has index equal to the
+ * previous entry and offset equal to the next non-empty block.
+ *
+ * The block lookup table is stored at the end of the total block structure.
+ *
+ *
+ * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each
+ * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block.
+ *
+ * Each rank-entry states the number of set bits within the block up to the bit before the
+ * bit positioned at the start of the sub-block.
+ * Note that the rank entry of the first sub-block is always 0 and that the last entry can
+ * at most be 65536-2 = 65534 and thus will always fit into a byte-pair of 16 bits.
+ *
+ * The rank structure for a given DENSE block is stored at the beginning of the DENSE block.
+ * This ensures locality and keeps logistics simple.
+ *
+ * @lucene.internal
+ */
+final class IndexedDISI extends DocIdSetIterator {
+
+ // jump-table time/space trade-offs to consider:
+ // The block offsets and the block indexes could be stored in more compressed form with
+ // two PackedInts or two MonotonicDirectReaders.
+ // The DENSE ranks (default 128 shorts = 256 bytes) could likewise be compressed. But as there is
+ // at least 4096 set bits in DENSE blocks, there will be at least one rank with 2^12 bits, so it
+ // is doubtful if there is much to gain here.
+
+ private static final int BLOCK_SIZE = 65536; // The number of docIDs that a single block represents
+
+ private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE/Long.SIZE; // 1024
+ public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs
+
+ static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
+
+ private static void flush(
+ int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException {
+ assert block >= 0 && block < 65536;
+ out.writeShort((short) block);
+ assert cardinality > 0 && cardinality <= 65536;
+ out.writeShort((short) (cardinality - 1));
+ if (cardinality > MAX_ARRAY_LENGTH) {
+ if (cardinality != 65536) { // all docs are set
+ if (denseRankPower != -1) {
+ final byte[] rank = createRank(buffer, denseRankPower);
+ out.writeBytes(rank, rank.length);
+ }
+ for (long word : buffer.getBits()) {
+ out.writeLong(word);
+ }
+ }
+ } else {
+ BitSetIterator it = new BitSetIterator(buffer, cardinality);
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ out.writeShort((short) doc);
+ }
+ }
+ }
+
+ // Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer.
+ // One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes.
+ // Represented as a byte[] for fast flushing and mirroring of the retrieval representation.
+ private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
+ final int longsPerRank = 1 << (denseRankPower-6);
+ final int rankMark = longsPerRank-1;
+ final int rankIndexShift = denseRankPower-7; // 6 for the long (2^6) + 1 for 2 bytes/entry
+ final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
+ final long[] bits = buffer.getBits();
+ int bitCount = 0;
+ for (int word = 0 ; word < DENSE_BLOCK_LONGS ; word++) {
+ if ((word & rankMark) == 0) { // Every longsPerRank longs
+ rank[word >> rankIndexShift] = (byte)(bitCount>>8);
+ rank[(word >> rankIndexShift)+1] = (byte)(bitCount & 0xFF);
+ }
+ bitCount += Long.bitCount(bits[word]);
+ }
+ return rank;
+ }
+
+ /**
+ * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically increasing
+ * gap-less order. DENSE blocks use the {@link #DEFAULT_DENSE_RANK_POWER} of 9 (every 512 docIDs / 8 longs).
+ * The caller must keep track of the number of jump-table entries (returned by this method) as well as the
+ * denseRankPower (9 for this method) and provide them when constructing an IndexedDISI for reading.
+ * @param it the document IDs.
+ * @param out destination for the blocks.
+ * @throws IOException if there was an error writing to out.
+ * @return the number of jump-table entries following the blocks, 0 if no jump-table was written.
+ * This should be stored in meta and used when creating an instance of IndexedDISI.
+ */
+ static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
+ return writeBitSet(it, out, DEFAULT_DENSE_RANK_POWER);
+ }
+
+ /**
+ * Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically
+ * increasing gap-less order.
+ * The caller must keep track of the number of jump-table entries (returned by this method) as well as the
+ * denseRankPower and provide them when constructing an IndexedDISI for reading.
+ * @param it the document IDs.
+ * @param out destination for the blocks.
+ * @param denseRankPower for {@link Method#DENSE} blocks, a rank will be written every {@code 2^denseRankPower} docIDs.
+ * A value of -1 disables DENSE rank; other values < 7 (every 128 docIDs) or > 15 (every 32768 docIDs) are rejected with an IllegalArgumentException.
+ * Recommended values are 8-12: Every 256-4096 docIDs or 4-64 longs.
+ * {@link #DEFAULT_DENSE_RANK_POWER} is 9: Every 512 docIDs.
+ * This should be stored in meta and used when creating an instance of IndexedDISI.
+ * @throws IOException if there was an error writing to out.
+ * @return the number of jump-table entries following the blocks, 0 if no jump-table was written.
+ * This should be stored in meta and used when creating an instance of IndexedDISI.
+ */
+ static short writeBitSet(DocIdSetIterator it, IndexOutput out, byte denseRankPower) throws IOException {
+ final long origo = out.getFilePointer(); // All jumps are relative to the origo
+ if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
+ throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " +
+ "The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs)");
+ }
+ int totalCardinality = 0;
+ int blockCardinality = 0;
+ final FixedBitSet buffer = new FixedBitSet(1<<16);
+ int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES*2)];
+ int prevBlock = -1;
+ int jumpBlockIndex = 0;
+
+ for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+ final int block = doc >>> 16;
+ if (prevBlock != -1 && block != prevBlock) {
+ // Track offset+index from previous block up to current
+ jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
+ jumpBlockIndex = prevBlock+1;
+ // Flush block
+ flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
+ // Reset for next block
+ buffer.clear(0, buffer.length());
+ totalCardinality += blockCardinality;
+ blockCardinality = 0;
+ }
+ buffer.set(doc & 0xFFFF);
+ blockCardinality++;
+ prevBlock = block;
+ }
+ if (blockCardinality > 0) {
+ jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
+ totalCardinality += blockCardinality;
+ flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
+ buffer.clear(0, buffer.length());
+ prevBlock++;
+ }
+ final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS)
+ // Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes the docID NO_MORE_DOCS
+ // To avoid creating 65K jump-table entries, only a single entry is created pointing to the offset of the
+ // NO_MORE_DOCS block, with the jumpBlockIndex set to the logical EMPTY block after all real blocks.
+ jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, lastBlock, lastBlock+1);
+ buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
+ flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, out);
+ // offset+index jump-table stored at the end
+ return flushBlockJumps(jumps, lastBlock+1, out, origo);
+ }
+
+ // Adds entries to the offset & index jump-table for blocks
+ private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) {
+ assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE";
+ jumps = ArrayUtil.grow(jumps, (endBlock+1)*2);
+ for (int b = startBlock; b < endBlock; b++) {
+ jumps[b*2] = index;
+ jumps[b*2+1] = (int) offset;
+ }
+ return jumps;
+ }
+
+ // Flushes the offset & index jump-table for blocks. This should be the last data written to out
+ // This method returns the blockCount for the blocks reachable from the jump-table or 0 for no jump-table
+ private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out, long origo) throws IOException {
+ if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space so we ignore that
+ blockCount = 0;
+ }
+ for (int i = 0 ; i < blockCount ; i++) {
+ out.writeInt(jumps[i*2]); // index
+ out.writeInt(jumps[i*2+1]); // offset
+ }
+ // As there are at most 32k blocks, the count is a short
+ // The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES)
+ return (short)blockCount;
+ }
+
+ /** The slice that stores the {@link DocIdSetIterator}. */
+ private final IndexInput slice;
+ private final int jumpTableEntryCount;
+ private final byte denseRankPower;
+ private final RandomAccessInput jumpTable; // Skip blocks of 64K bits
+ private final byte[] denseRankTable;
+ private final long cost;
+
+ /**
+ * This constructor always creates a new blockSlice and a new jumpTable from in, to ensure that operations are
+ * independent from the caller.
+ * See {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)} for re-use of blockSlice and jumpTable.
+ * @param in backing data.
+ * @param offset starting offset for blocks in the backing data.
+ * @param length the number of bytes holding blocks and jump-table in the backing data.
+ * @param jumpTableEntryCount the number of blocks covered by the jump-table.
+ * This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}.
+ * @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}.
+ * This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}
+ * @param cost normally the number of logical docIDs.
+ */
+ IndexedDISI(IndexInput in, long offset, long length, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException {
+ this(createBlockSlice(in,"docs", offset, length, jumpTableEntryCount),
+ createJumpTable(in, offset, length, jumpTableEntryCount),
+ jumpTableEntryCount, denseRankPower, cost);
+ }
+
+ /**
+ * This constructor allows passing the slice and jumpTable directly in case it helps reuse.
+ * See e.g. the Lucene80 norms producer's merge instance.
+ * @param blockSlice data blocks, normally created by {@link #createBlockSlice}.
+ * @param jumpTable table holding jump-data for block-skips, normally created by {@link #createJumpTable}.
+ * @param jumpTableEntryCount the number of blocks covered by the jump-table.
+ * This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}.
+ * @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}.
+ * This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}
+ * @param cost normally the number of logical docIDs.
+ */
+ IndexedDISI(IndexInput blockSlice, RandomAccessInput jumpTable, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException {
+ if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
+ throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " +
+ "The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs). ");
+ }
+
+ this.slice = blockSlice;
+ this.jumpTable = jumpTable;
+ this.jumpTableEntryCount = jumpTableEntryCount;
+ this.denseRankPower = denseRankPower;
+ final int rankIndexShift = denseRankPower-7;
+ this.denseRankTable = denseRankPower == -1 ? null : new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
+ this.cost = cost;
+ }
+
+ /**
+ * Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}.
+ * Creates a disiSlice for the IndexedDISI data blocks, without the jump-table.
+ * @param slice backing data, holding both blocks and jump-table.
+ * @param sliceDescription human readable slice designation.
+ * @param offset relative to the backing data.
+ * @param length full length of the IndexedDISI, including blocks and jump-table data.
+ * @param jumpTableEntryCount the number of blocks covered by the jump-table.
+ * @return a slice covering only the data blocks, i.e. the full IndexedDISI data minus the trailing jump-table.
+ * @throws IOException if the block slice could not be created from slice.
+ */
+ public static IndexInput createBlockSlice(
+ IndexInput slice, String sliceDescription, long offset, long length, int jumpTableEntryCount) throws IOException {
+ long jumpTableBytes = jumpTableEntryCount < 0 ? 0 : jumpTableEntryCount*Integer.BYTES*2;
+ return slice.slice(sliceDescription, offset, length - jumpTableBytes);
+ }
+
+ /**
+ * Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}.
+ * Creates a RandomAccessInput covering only the jump-table data or null.
+ * @param slice backing data, holding both blocks and jump-table.
+ * @param offset relative to the backing data.
+ * @param length full length of the IndexedDISI, including blocks and jump-table data.
+ * @param jumpTableEntryCount the number of blocks covered by the jump-table.
+ * @return a jumpTable containing the block jump-data or null if no such table exists.
+ * @throws IOException if a RandomAccessInput could not be created from slice.
+ */
+ public static RandomAccessInput createJumpTable(
+ IndexInput slice, long offset, long length, int jumpTableEntryCount) throws IOException {
+ if (jumpTableEntryCount <= 0) {
+ return null;
+ } else {
+ int jumpTableBytes = jumpTableEntryCount*Integer.BYTES*2;
+ return slice.randomAccessSlice(offset + length - jumpTableBytes, jumpTableBytes);
+ }
+ }
+
+ private int block = -1;
+ private long blockEnd;
+ private long denseBitmapOffset = -1; // Only used for DENSE blocks
+ private int nextBlockIndex = -1;
+ Method method;
+
+ private int doc = -1;
+ private int index = -1;
+
+ // SPARSE variables
+ private boolean exists;
+
+ // DENSE variables
+ private long word;
+ private int wordIndex = -1;
+ // number of one bits encountered so far, including those of `word`
+ private int numberOfOnes;
+ // Used with rank for jumps inside of DENSE as they are absolute instead of relative
+ private int denseOrigoIndex;
+
+ // ALL variables
+ private int gap;
+
+ @Override
+ public int docID() {
+ return doc;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ final int targetBlock = target & 0xFFFF0000;
+ if (block < targetBlock) {
+ advanceBlock(targetBlock);
+ }
+ if (block == targetBlock) {
+ if (method.advanceWithinBlock(this, target)) {
+ return doc;
+ }
+ readBlockHeader();
+ }
+ boolean found = method.advanceWithinBlock(this, block);
+ assert found;
+ return doc;
+ }
+
+ public boolean advanceExact(int target) throws IOException {
+ final int targetBlock = target & 0xFFFF0000;
+ if (block < targetBlock) {
+ advanceBlock(targetBlock);
+ }
+ boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target);
+ this.doc = target;
+ return found;
+ }
+
+ private void advanceBlock(int targetBlock) throws IOException {
+ final int blockIndex = targetBlock >> 16;
+ // If the destination block is 2 blocks or more ahead, we use the jump-table.
+ if (jumpTable != null && blockIndex >= (block >> 16)+2) {
+ // If the jumpTableEntryCount is exceeded, there are no further bits. Last entry is always NO_MORE_DOCS
+ final int inRangeBlockIndex = blockIndex < jumpTableEntryCount ? blockIndex : jumpTableEntryCount-1;
+ final int index = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2);
+ final int offset = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2+Integer.BYTES);
+ this.nextBlockIndex = index-1; // -1 to compensate for the always-added 1 in readBlockHeader
+ slice.seek(offset);
+ readBlockHeader();
+ return;
+ }
+
+ // Fallback to iteration of blocks
+ do {
+ slice.seek(blockEnd);
+ readBlockHeader();
+ } while (block < targetBlock);
+ }
+
+ private void readBlockHeader() throws IOException {
+ block = Short.toUnsignedInt(slice.readShort()) << 16;
+ assert block >= 0;
+ final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
+ index = nextBlockIndex;
+ nextBlockIndex = index + numValues;
+ if (numValues <= MAX_ARRAY_LENGTH) {
+ method = Method.SPARSE;
+ blockEnd = slice.getFilePointer() + (numValues << 1);
+ } else if (numValues == 65536) {
+ method = Method.ALL;
+ blockEnd = slice.getFilePointer();
+ gap = block - index - 1;
+ } else {
+ method = Method.DENSE;
+ denseBitmapOffset = slice.getFilePointer() + (denseRankTable == null ? 0 : denseRankTable.length);
+ blockEnd = denseBitmapOffset + (1 << 13);
+ // Performance consideration: All rank (default 128 * 16 bits) are loaded up front. This should be fast with the
+ // reusable byte[] buffer, but it is still wasted if the DENSE block is iterated in small steps.
+ // If this results in too great a performance regression, a heuristic strategy might work where the rank data
+ // are loaded on first in-block advance, if said advance is > X docIDs. The hope being that a small first
+ // advance means that subsequent advances will be small too.
+ // Another alternative is to maintain an extra slice for DENSE rank, but IndexedDISI is already slice-heavy.
+ if (denseRankPower != -1) {
+ slice.readBytes(denseRankTable, 0, denseRankTable.length);
+ }
+ wordIndex = -1;
+ numberOfOnes = index + 1;
+ denseOrigoIndex = numberOfOnes;
+ }
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return advance(doc + 1);
+ }
+
+ public int index() {
+ return index;
+ }
+
+ @Override
+ public long cost() {
+ return cost;
+ }
+
+ enum Method {
+ SPARSE {
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ // TODO: binary search
+ for (; disi.index < disi.nextBlockIndex;) {
+ int doc = Short.toUnsignedInt(disi.slice.readShort());
+ disi.index++;
+ if (doc >= targetInBlock) {
+ disi.doc = disi.block | doc;
+ disi.exists = true;
+ return true;
+ }
+ }
+ return false;
+ }
+ @Override
+ boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ // TODO: binary search
+ if (target == disi.doc) {
+ return disi.exists;
+ }
+ for (; disi.index < disi.nextBlockIndex;) {
+ int doc = Short.toUnsignedInt(disi.slice.readShort());
+ disi.index++;
+ if (doc >= targetInBlock) {
+ if (doc != targetInBlock) {
+ disi.index--;
+ disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES);
+ break;
+ }
+ disi.exists = true;
+ return true;
+ }
+ }
+ disi.exists = false;
+ return false;
+ }
+ },
+ DENSE {
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ final int targetWordIndex = targetInBlock >>> 6;
+
+ // If possible, skip ahead using the rank cache
+ rankSkip(disi, target);
+
+ for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
+ disi.word = disi.slice.readLong();
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ }
+ disi.wordIndex = targetWordIndex;
+
+ long leftBits = disi.word >>> target;
+ if (leftBits != 0L) {
+ disi.doc = target + Long.numberOfTrailingZeros(leftBits);
+ disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
+ return true;
+ }
+
+ // There were no set bits at the wanted position. Move forward until one is reached
+ while (++disi.wordIndex < 1024) {
+ // This could use the rank cache to skip empty spaces >= 512 bits, but it seems unrealistic
+ // that such blocks would be DENSE
+ disi.word = disi.slice.readLong();
+ if (disi.word != 0) {
+ disi.index = disi.numberOfOnes;
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
+ return true;
+ }
+ }
+ // No set bits in the block at or after the wanted position.
+ return false;
+ }
+
+ @Override
+ boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
+ final int targetInBlock = target & 0xFFFF;
+ final int targetWordIndex = targetInBlock >>> 6;
+
+ rankSkip(disi, target);
+
+ for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
+ disi.word = disi.slice.readLong();
+ disi.numberOfOnes += Long.bitCount(disi.word);
+ }
+ disi.wordIndex = targetWordIndex;
+
+ long leftBits = disi.word >>> target;
+ disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
+ return (leftBits & 1L) != 0;
+ }
+
+
+ },
+ ALL {
+ @Override
+ boolean advanceWithinBlock(IndexedDISI disi, int target) {
+ disi.doc = target;
+ disi.index = target - disi.gap;
+ return true;
+ }
+ @Override
+ boolean advanceExactWithinBlock(IndexedDISI disi, int target) {
+ disi.index = target - disi.gap;
+ return true;
+ }
+ };
+
+ /** Advance to the first doc from the block that is equal to or greater than {@code target}.
+ * Return true if there is such a doc and false otherwise. */
+ abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
+
+ /** Advance the iterator exactly to the position corresponding to the given {@code target}
+ * and return whether this document exists. */
+ abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
+ }
+
+ /**
+ * If the distance between the current position and the target is > 8 words, the rank cache will
+ * be used to guarantee a worst-case of 1 rank-lookup and 7 word-read-and-count-bits operations.
+ * Note: This does not guarantee a skip up to target, only up to nearest rank boundary. It is the
+ * responsibility of the caller to iterate further to reach target.
+ * @param disi standard DISI.
+ * @param target the wanted docID for which to calculate set-flag and index.
+ * @throws IOException if a DISI seek failed.
+ */
+ private static void rankSkip(IndexedDISI disi, int target) throws IOException {
+ if (disi.denseRankPower == -1) { // No rank for the current structure
+ return;
+ }
+
+ final int targetInBlock = target & 0xFFFF; // Lower 16 bits
+ final int targetWordIndex = targetInBlock >>> 6; // long: 2^6 = 64
+
+ // If the distance between the current position and the target is < rank-longs
+ // there is no sense in using rank
+ if (targetWordIndex - disi.wordIndex < (1 << (disi.denseRankPower-6) )) {
+ return;
+ }
+
+ // Resolve the rank as close to targetInBlock as possible (maximum distance is 8 longs)
+ // Note: rankOrigoOffset is tracked on block open, so it is absolute (e.g. don't add origo)
+ final int rankIndex = targetInBlock >> disi.denseRankPower; // Default is 9 (8 longs: 2^3 * 2^6 = 512 docIDs)
+
+ final int rank =
+ (disi.denseRankTable[rankIndex<<1] & 0xFF) << 8 |
+ (disi.denseRankTable[(rankIndex<<1)+1] & 0xFF);
+
+ // Position the counting logic just after the rank point
+ final int rankAlignedWordIndex = rankIndex << disi.denseRankPower >> 6;
+ disi.slice.seek(disi.denseBitmapOffset + rankAlignedWordIndex*Long.BYTES);
+ long rankWord = disi.slice.readLong();
+ int denseNOO = rank + Long.bitCount(rankWord);
+
+ disi.wordIndex = rankAlignedWordIndex;
+ disi.word = rankWord;
+ disi.numberOfOnes = disi.denseOrigoIndex + denseNOO;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
index 1905b70..93e91ea 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
@@ -37,7 +37,6 @@ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat;
-import org.apache.lucene.codecs.lucene70.Lucene70NormsFormat;
import org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -150,7 +149,7 @@ public class Lucene80Codec extends Codec {
/** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
*
- * The default implementation always returns "Lucene70".
+ * The default implementation always returns "Lucene80".
* <p>
* <b>WARNING:</b> if you subclass, you are responsible for index
* backwards compatibility: future version of Lucene are only
@@ -166,9 +165,9 @@ public class Lucene80Codec extends Codec {
}
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
- private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene70");
+ private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
- private final NormsFormat normsFormat = new Lucene70NormsFormat();
+ private final NormsFormat normsFormat = new Lucene80NormsFormat();
@Override
public final NormsFormat normsFormat() {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
similarity index 71%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
index 6db3cca..38b9fc0 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesConsumer.java
@@ -14,14 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
+package org.apache.lucene.codecs.lucene80;
-import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
-import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SHIFT;
-import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE;
-
-import java.io.Closeable; // javadocs
+import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
@@ -47,6 +43,7 @@ import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
@@ -55,22 +52,26 @@ import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.DirectMonotonicWriter;
import org.apache.lucene.util.packed.DirectWriter;
-/** writer for {@link Lucene70DocValuesFormat} */
-final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Closeable {
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT;
+import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
+
+/** writer for {@link Lucene80DocValuesFormat} */
+final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {
IndexOutput data, meta;
final int maxDoc;
/** expert: Creates a new writer */
- public Lucene70DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
boolean success = false;
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
- CodecUtil.writeIndexHeader(data, dataCodec, Lucene70DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
- CodecUtil.writeIndexHeader(meta, metaCodec, Lucene70DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
maxDoc = state.segmentInfo.maxDoc();
success = true;
} finally {
@@ -105,7 +106,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
@Override
public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
- meta.writeByte(Lucene70DocValuesFormat.NUMERIC);
+ meta.writeByte(Lucene80DocValuesFormat.NUMERIC);
writeValues(field, new EmptyDocValuesProducer() {
@Override
@@ -196,27 +197,33 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
final long max = minMax.max;
assert blockMinMax.spaceInBits <= minMax.spaceInBits;
- if (numDocsWithValue == 0) {
- meta.writeLong(-2);
- meta.writeLong(0L);
- } else if (numDocsWithValue == maxDoc) {
- meta.writeLong(-1);
- meta.writeLong(0L);
- } else {
+ if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
+ } else if (numDocsWithValue == maxDoc) { // meta[-1, 0]: All documents have values
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
+ } else { // meta[data.offset, data.length]: IndexedDISI structure for documents with values
long offset = data.getFilePointer();
- meta.writeLong(offset);
+ meta.writeLong(offset);// docsWithFieldOffset
values = valuesProducer.getSortedNumeric(field);
- IndexedDISI.writeBitSet(values, data);
- meta.writeLong(data.getFilePointer() - offset);
+ final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
+ meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
meta.writeLong(numValues);
final int numBitsPerValue;
boolean doBlocks = false;
Map<Long, Integer> encode = null;
- if (min >= max) {
+ if (min >= max) { // meta[-1]: All values are identical (0 bits per value)
numBitsPerValue = 0;
- meta.writeInt(-1);
+ meta.writeInt(-1); // tablesize
} else {
if (uniqueValues != null
&& uniqueValues.size() > 1
@@ -224,9 +231,9 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1);
final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]);
Arrays.sort(sortedUniqueValues);
- meta.writeInt(sortedUniqueValues.length);
+ meta.writeInt(sortedUniqueValues.length); // tablesize
for (Long v : sortedUniqueValues) {
- meta.writeLong(v);
+ meta.writeLong(v); // table[] entry
}
encode = new HashMap<>();
for (int i = 0; i < sortedUniqueValues.length; ++i) {
@@ -240,14 +247,14 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9;
if (doBlocks) {
numBitsPerValue = 0xFF;
- meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT);
+ meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize
} else {
numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
if (gcd == 1 && min > 0
&& DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
min = 0;
}
- meta.writeInt(-1);
+ meta.writeInt(-1); // tablesize
}
}
}
@@ -256,14 +263,15 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
meta.writeLong(min);
meta.writeLong(gcd);
long startOffset = data.getFilePointer();
- meta.writeLong(startOffset);
+ meta.writeLong(startOffset); // valueOffset
+ long jumpTableOffset = -1;
if (doBlocks) {
- writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
+ jumpTableOffset = writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
} else if (numBitsPerValue != 0) {
writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode);
}
- meta.writeLong(data.getFilePointer() - startOffset);
-
+ meta.writeLong(data.getFilePointer() - startOffset); // valuesLength
+ meta.writeLong(jumpTableOffset);
return new long[] {numDocsWithValue, numValues};
}
@@ -282,8 +290,11 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
}
writer.finish();
}
-
- private void writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
+
+ // Returns the offset to the jump-table for vBPV
+ private long writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
+ long[] offsets = new long[ArrayUtil.oversize(1, Long.BYTES)];
+ int offsetsIndex = 0;
final long[] buffer = new long[NUMERIC_BLOCK_SIZE];
final ByteBuffersDataOutput encodeBuffer = ByteBuffersDataOutput.newResettableInstance();
int upTo = 0;
@@ -291,14 +302,26 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
buffer[upTo++] = values.nextValue();
if (upTo == NUMERIC_BLOCK_SIZE) {
+ offsets = ArrayUtil.grow(offsets, offsetsIndex+1);
+ offsets[offsetsIndex++] = data.getFilePointer();
writeBlock(buffer, NUMERIC_BLOCK_SIZE, gcd, encodeBuffer);
upTo = 0;
}
}
}
if (upTo > 0) {
+ offsets = ArrayUtil.grow(offsets, offsetsIndex+1);
+ offsets[offsetsIndex++] = data.getFilePointer();
writeBlock(buffer, upTo, gcd, encodeBuffer);
}
+
+ // All blocks have been written. Flush the offset jump-table
+ final long offsetsOrigo = data.getFilePointer();
+ for (int i = 0 ; i < offsetsIndex ; i++) {
+ data.writeLong(offsets[i]);
+ }
+ data.writeLong(offsetsOrigo);
+ return offsetsOrigo;
}
private void writeBlock(long[] values, int length, long gcd, ByteBuffersDataOutput buffer) throws IOException {
@@ -333,11 +356,11 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
- meta.writeByte(Lucene70DocValuesFormat.BINARY);
+ meta.writeByte(Lucene80DocValuesFormat.BINARY);
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
- meta.writeLong(start);
+ meta.writeLong(start); // dataOffset
int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
@@ -350,20 +373,26 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
maxLength = Math.max(length, maxLength);
}
assert numDocsWithField <= maxDoc;
- meta.writeLong(data.getFilePointer() - start);
+ meta.writeLong(data.getFilePointer() - start); // dataLength
if (numDocsWithField == 0) {
- meta.writeLong(-2);
- meta.writeLong(0L);
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
- meta.writeLong(-1);
- meta.writeLong(0L);
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
- meta.writeLong(offset);
+ meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
- IndexedDISI.writeBitSet(values, data);
- meta.writeLong(data.getFilePointer() - offset);
+ final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
+ meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
meta.writeInt(numDocsWithField);
@@ -390,7 +419,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
@Override
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
- meta.writeByte(Lucene70DocValuesFormat.SORTED);
+ meta.writeByte(Lucene80DocValuesFormat.SORTED);
doAddSortedField(field, valuesProducer);
}
@@ -402,36 +431,42 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
}
if (numDocsWithField == 0) {
- meta.writeLong(-2);
- meta.writeLong(0L);
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
- meta.writeLong(-1);
- meta.writeLong(0L);
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
- meta.writeLong(offset);
+ meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSorted(field);
- IndexedDISI.writeBitSet(values, data);
- meta.writeLong(data.getFilePointer() - offset);
+ final short jumpTableentryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
+ meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableentryCount);
+ meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
meta.writeInt(numDocsWithField);
if (values.getValueCount() <= 1) {
- meta.writeByte((byte) 0);
- meta.writeLong(0L);
- meta.writeLong(0L);
+ meta.writeByte((byte) 0); // bitsPerValue
+ meta.writeLong(0L); // ordsOffset
+ meta.writeLong(0L); // ordsLength
} else {
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
- meta.writeByte((byte) numberOfBitsPerOrd);
+ meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
long start = data.getFilePointer();
- meta.writeLong(start);
+ meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd);
values = valuesProducer.getSorted(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
writer.add(values.ordValue());
}
writer.finish();
- meta.writeLong(data.getFilePointer() - start);
+ meta.writeLong(data.getFilePointer() - start); // ordsLength
}
addTermsDict(DocValues.singleton(valuesProducer.getSorted(field)));
@@ -440,12 +475,12 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
private void addTermsDict(SortedSetDocValues values) throws IOException {
final long size = values.getValueCount();
meta.writeVLong(size);
- meta.writeInt(Lucene70DocValuesFormat.TERMS_DICT_BLOCK_SHIFT);
+ meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT);
ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
ByteBuffersIndexOutput addressOutput = new ByteBuffersIndexOutput(addressBuffer, "temp", "temp");
meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
- long numBlocks = (size + Lucene70DocValuesFormat.TERMS_DICT_BLOCK_MASK) >>> Lucene70DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
+ long numBlocks = (size + Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) >>> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
BytesRefBuilder previous = new BytesRefBuilder();
@@ -454,7 +489,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
int maxLength = 0;
TermsEnum iterator = values.termsEnum();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
- if ((ord & Lucene70DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
+ if ((ord & Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
writer.add(data.getFilePointer() - start);
data.writeVInt(term.length);
data.writeBytes(term.bytes, term.offset, term.length);
@@ -491,50 +526,50 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
private void writeTermsIndex(SortedSetDocValues values) throws IOException {
final long size = values.getValueCount();
- meta.writeInt(Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
+ meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
long start = data.getFilePointer();
- long numBlocks = 1L + ((size + Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) >>> Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
+ long numBlocks = 1L + ((size + Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) >>> Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
DirectMonotonicWriter writer;
try (ByteBuffersIndexOutput addressOutput = new ByteBuffersIndexOutput(addressBuffer, "temp", "temp")) {
writer = DirectMonotonicWriter.getInstance(meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
- TermsEnum iterator = values.termsEnum();
- BytesRefBuilder previous = new BytesRefBuilder();
- long offset = 0;
- long ord = 0;
- for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
- if ((ord & Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == 0) {
- writer.add(offset);
- final int sortKeyLength;
- if (ord == 0) {
- // no previous term: no bytes to write
- sortKeyLength = 0;
- } else {
- sortKeyLength = StringHelper.sortKeyLength(previous.get(), term);
+ TermsEnum iterator = values.termsEnum();
+ BytesRefBuilder previous = new BytesRefBuilder();
+ long offset = 0;
+ long ord = 0;
+ for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+ if ((ord & Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == 0) {
+ writer.add(offset);
+ final int sortKeyLength;
+ if (ord == 0) {
+ // no previous term: no bytes to write
+ sortKeyLength = 0;
+ } else {
+ sortKeyLength = StringHelper.sortKeyLength(previous.get(), term);
+ }
+ offset += sortKeyLength;
+ data.writeBytes(term.bytes, term.offset, sortKeyLength);
+ } else if ((ord & Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) {
+ previous.copyBytes(term);
}
- offset += sortKeyLength;
- data.writeBytes(term.bytes, term.offset, sortKeyLength);
- } else if ((ord & Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == Lucene70DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) {
- previous.copyBytes(term);
+ ++ord;
}
- ++ord;
- }
- writer.add(offset);
- writer.finish();
- meta.writeLong(start);
- meta.writeLong(data.getFilePointer() - start);
- start = data.getFilePointer();
+ writer.add(offset);
+ writer.finish();
+ meta.writeLong(start);
+ meta.writeLong(data.getFilePointer() - start);
+ start = data.getFilePointer();
addressBuffer.copyTo(data);
- meta.writeLong(start);
- meta.writeLong(data.getFilePointer() - start);
- }
+ meta.writeLong(start);
+ meta.writeLong(data.getFilePointer() - start);
+ }
}
@Override
public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
- meta.writeByte(Lucene70DocValuesFormat.SORTED_NUMERIC);
+ meta.writeByte(Lucene80DocValuesFormat.SORTED_NUMERIC);
long[] stats = writeValues(field, valuesProducer);
int numDocsWithField = Math.toIntExact(stats[0]);
@@ -563,7 +598,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
@Override
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
- meta.writeByte(Lucene70DocValuesFormat.SORTED_SET);
+ meta.writeByte(Lucene80DocValuesFormat.SORTED_SET);
SortedSetDocValues values = valuesProducer.getSortedSet(field);
int numDocsWithField = 0;
@@ -576,7 +611,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
}
if (numDocsWithField == numOrds) {
- meta.writeByte((byte) 0);
+ meta.writeByte((byte) 0); // multiValued (0 = singleValued)
doAddSortedField(field, new EmptyDocValuesProducer() {
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
@@ -585,24 +620,28 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
});
return;
}
- meta.writeByte((byte) 1);
+ meta.writeByte((byte) 1); // multiValued (1 = multiValued)
assert numDocsWithField != 0;
if (numDocsWithField == maxDoc) {
- meta.writeLong(-1);
- meta.writeLong(0L);
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
- meta.writeLong(offset);
+ meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getSortedSet(field);
- IndexedDISI.writeBitSet(values, data);
- meta.writeLong(data.getFilePointer() - offset);
+ final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
+ meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
- meta.writeByte((byte) numberOfBitsPerOrd);
+ meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
long start = data.getFilePointer();
- meta.writeLong(start);
+ meta.writeLong(start); // ordsOffset
DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
values = valuesProducer.getSortedSet(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
@@ -611,11 +650,11 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
}
}
writer.finish();
- meta.writeLong(data.getFilePointer() - start);
+ meta.writeLong(data.getFilePointer() - start); // ordsLength
meta.writeInt(numDocsWithField);
start = data.getFilePointer();
- meta.writeLong(start);
+ meta.writeLong(start); // addressesOffset
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
@@ -631,7 +670,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
addressesWriter.add(addr);
}
addressesWriter.finish();
- meta.writeLong(data.getFilePointer() - start);
+ meta.writeLong(data.getFilePointer() - start); // addressesLength
addTermsDict(values);
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java
similarity index 84%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java
index 2ce2124..029980f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesFormat.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
+package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
@@ -31,7 +31,7 @@ import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.packed.DirectWriter;
/**
- * Lucene 7.0 DocValues format.
+ * Lucene 8.0 DocValues format.
* <p>
* Documents that have a value for the field are encoded in a way that it is always possible to
* know the ordinal of the current document in the set of documents that have a value. For instance,
@@ -47,9 +47,14 @@ import org.apache.lucene.util.packed.DirectWriter;
* bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
* 16 bits are given by the block ID.
* <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
- * lower bits of doc IDs are stored in a bit set. Advancing is performed using
+ * lower bits of doc IDs are stored in a bit set. Advancing by fewer than 512 documents is performed using
* {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
* accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
+ * Advancing by 512 or more documents is performed by skipping to the start of the needed 512 document
+ * sub-block and iterating to the specific document within that block. The index for the
+ * sub-block that is skipped to is retrieved from a rank-table positioned before the bit set.
+ * The rank-table holds the origo index numbers for all 512-document sub-blocks, represented
+ * as one unsigned short for each of the 128 sub-blocks.
* <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
* the block is full. In that case doc IDs do not need to be stored explicitly. This is
* typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
@@ -57,6 +62,11 @@ import org.apache.lucene.util.packed.DirectWriter;
* using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
* </ul>
* <p>
+ * Skipping blocks to arrive at a wanted document is either done on an iterative basis or by using the
+ * jump-table stored at the end of the chain of blocks. The jump-table holds the offset as well as the
+ * index for all blocks, packed in a single long per block.
+ * </p>
+ * <p>
* Then the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) are
* encoded using the following strategies:
* <p>
@@ -76,6 +86,10 @@ import org.apache.lucene.util.packed.DirectWriter;
* this value is encoded alone.
* </ul>
* <p>
+ * Depending on calculated gains, the numbers might be split into blocks of 16384 values. In that case,
+ * a jump-table with block offsets is appended to the blocks for O(1) access to the needed block.
+ * </p>
+ * <p>
* {@link DocValuesType#BINARY BINARY}:
* <ul>
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
@@ -115,26 +129,26 @@ import org.apache.lucene.util.packed.DirectWriter;
* </ol>
* @lucene.experimental
*/
-public final class Lucene70DocValuesFormat extends DocValuesFormat {
+public final class Lucene80DocValuesFormat extends DocValuesFormat {
/** Sole Constructor */
- public Lucene70DocValuesFormat() {
- super("Lucene70");
+ public Lucene80DocValuesFormat() {
+ super("Lucene80");
}
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- return new Lucene70DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+ return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
- return new Lucene70DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+ return new Lucene80DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
}
- static final String DATA_CODEC = "Lucene70DocValuesData";
+ static final String DATA_CODEC = "Lucene80DocValuesData";
static final String DATA_EXTENSION = "dvd";
- static final String META_CODEC = "Lucene70DocValuesMetadata";
+ static final String META_CODEC = "Lucene80DocValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java
similarity index 89%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java
index b0f6e84..e0ece52 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80DocValuesProducer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
+package org.apache.lucene.codecs.lucene80;
import java.io.Closeable;
import java.io.IOException;
@@ -48,8 +48,8 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.DirectMonotonicReader;
import org.apache.lucene.util.packed.DirectReader;
-/** reader for {@link Lucene70DocValuesFormat} */
-final class Lucene70DocValuesProducer extends DocValuesProducer implements Closeable {
+/** reader for {@link Lucene80DocValuesFormat} */
+final class Lucene80DocValuesProducer extends DocValuesProducer implements Closeable {
private final Map<String,NumericEntry> numerics = new HashMap<>();
private final Map<String,BinaryEntry> binaries = new HashMap<>();
private final Map<String,SortedEntry> sorted = new HashMap<>();
@@ -60,7 +60,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
private final int maxDoc;
/** expert: instantiates a new reader */
- Lucene70DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ Lucene80DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
this.maxDoc = state.segmentInfo.maxDoc();
ramBytesUsed = RamUsageEstimator.shallowSizeOfInstance(getClass());
@@ -72,8 +72,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
Throwable priorE = null;
try {
version = CodecUtil.checkIndexHeader(in, metaCodec,
- Lucene70DocValuesFormat.VERSION_START,
- Lucene70DocValuesFormat.VERSION_CURRENT,
+ Lucene80DocValuesFormat.VERSION_START,
+ Lucene80DocValuesFormat.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
readFields(in, state.fieldInfos);
@@ -89,8 +89,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
boolean success = false;
try {
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec,
- Lucene70DocValuesFormat.VERSION_START,
- Lucene70DocValuesFormat.VERSION_CURRENT,
+ Lucene80DocValuesFormat.VERSION_START,
+ Lucene80DocValuesFormat.VERSION_CURRENT,
state.segmentInfo.getId(),
state.segmentSuffix);
if (version != version2) {
@@ -118,15 +118,15 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
}
byte type = meta.readByte();
- if (type == Lucene70DocValuesFormat.NUMERIC) {
+ if (type == Lucene80DocValuesFormat.NUMERIC) {
numerics.put(info.name, readNumeric(meta));
- } else if (type == Lucene70DocValuesFormat.BINARY) {
+ } else if (type == Lucene80DocValuesFormat.BINARY) {
binaries.put(info.name, readBinary(meta));
- } else if (type == Lucene70DocValuesFormat.SORTED) {
+ } else if (type == Lucene80DocValuesFormat.SORTED) {
sorted.put(info.name, readSorted(meta));
- } else if (type == Lucene70DocValuesFormat.SORTED_SET) {
+ } else if (type == Lucene80DocValuesFormat.SORTED_SET) {
sortedSets.put(info.name, readSortedSet(meta));
- } else if (type == Lucene70DocValuesFormat.SORTED_NUMERIC) {
+ } else if (type == Lucene80DocValuesFormat.SORTED_NUMERIC) {
sortedNumerics.put(info.name, readSortedNumeric(meta));
} else {
throw new CorruptIndexException("invalid type: " + type, meta);
@@ -143,6 +143,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
private void readNumeric(ChecksumIndexInput meta, NumericEntry entry) throws IOException {
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
+ entry.jumpTableEntryCount = meta.readShort();
+ entry.denseRankPower = meta.readByte();
entry.numValues = meta.readLong();
int tableSize = meta.readInt();
if (tableSize > 256) {
@@ -165,6 +167,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
entry.gcd = meta.readLong();
entry.valuesOffset = meta.readLong();
entry.valuesLength = meta.readLong();
+ entry.valueJumpTableOffset = meta.readLong();
}
private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
@@ -173,6 +176,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
entry.dataLength = meta.readLong();
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
+ entry.jumpTableEntryCount = meta.readShort();
+ entry.denseRankPower = meta.readByte();
entry.numDocsWithField = meta.readInt();
entry.minLength = meta.readInt();
entry.maxLength = meta.readInt();
@@ -190,6 +195,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
SortedEntry entry = new SortedEntry();
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
+ entry.jumpTableEntryCount = meta.readShort();
+ entry.denseRankPower = meta.readByte();
entry.numDocsWithField = meta.readInt();
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
@@ -212,6 +219,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
+ entry.jumpTableEntryCount = meta.readShort();
+ entry.denseRankPower = meta.readByte();
entry.bitsPerValue = meta.readByte();
entry.ordsOffset = meta.readLong();
entry.ordsLength = meta.readLong();
@@ -270,11 +279,14 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
byte bitsPerValue;
long docsWithFieldOffset;
long docsWithFieldLength;
+ short jumpTableEntryCount;
+ byte denseRankPower;
long numValues;
long minValue;
long gcd;
long valuesOffset;
long valuesLength;
+ long valueJumpTableOffset; // -1 if no jump-table
}
private static class BinaryEntry {
@@ -282,6 +294,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
long dataLength;
long docsWithFieldOffset;
long docsWithFieldLength;
+ short jumpTableEntryCount;
+ byte denseRankPower;
int numDocsWithField;
int minLength;
int maxLength;
@@ -310,6 +324,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
private static class SortedEntry extends TermsDictEntry {
long docsWithFieldOffset;
long docsWithFieldLength;
+ short jumpTableEntryCount;
+ byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
@@ -320,6 +336,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
SortedEntry singleValueEntry;
long docsWithFieldOffset;
long docsWithFieldLength;
+ short jumpTableEntryCount;
+ byte denseRankPower;
int numDocsWithField;
byte bitsPerValue;
long ordsOffset;
@@ -438,38 +456,12 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
// dense but split into blocks of different bits per value
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final int mask = (1 << shift) - 1;
return new DenseNumericDocValues(maxDoc) {
- int block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry, slice);
@Override
public long longValue() throws IOException {
- final int block = doc >>> shift;
- if (this.block != block) {
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
- }
- return mul * values.get(doc & mask) + delta;
+ return vBPVReader.getLongValue(doc);
}
};
} else {
@@ -496,7 +488,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
} else {
// sparse
- final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numValues);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength,
+ entry.jumpTableEntryCount, entry.denseRankPower, entry.numValues);
if (entry.bitsPerValue == 0) {
return new SparseNumericDocValues(disi) {
@Override
@@ -508,39 +501,13 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
// sparse and split into blocks of different bits per value
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final int mask = (1 << shift) - 1;
return new SparseNumericDocValues(disi) {
- int block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry, slice);
@Override
public long longValue() throws IOException {
final int index = disi.index();
- final int block = index >>> shift;
- if (this.block != block) {
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
- }
- return mul * values.get(index & mask) + delta;
+ return vBPVReader.getLongValue(index);
}
};
} else {
@@ -579,42 +546,15 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
if (entry.blockShift >= 0) {
- final int shift = entry.blockShift;
- final long mul = entry.gcd;
- final long mask = (1L << shift) - 1;
return new LongValues() {
- long block = -1;
- long delta;
- long offset;
- long blockEndOffset;
- LongValues values;
-
+ final VaryingBPVReader vBPVReader = new VaryingBPVReader(entry, slice);
+ @Override
public long get(long index) {
- final long block = index >>> shift;
- if (this.block != block) {
- assert block > this.block : "Reading backwards is illegal: " + this.block + " < " + block;
- int bitsPerValue;
- do {
- offset = blockEndOffset;
- try {
- bitsPerValue = slice.readByte(offset++);
- delta = slice.readLong(offset);
- offset += Long.BYTES;
- if (bitsPerValue == 0) {
- blockEndOffset = offset;
- } else {
- final int length = slice.readInt(offset);
- offset += Integer.BYTES;
- blockEndOffset = offset + length;
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- this.block ++;
- } while (this.block != block);
- values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
+ try {
+ return vBPVReader.getLongValue(index);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
}
- return mul * values.get(index & mask) + delta;
}
};
} else {
@@ -767,7 +707,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
}
} else {
// sparse
- final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength,
+ entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
if (entry.minLength == entry.maxLength) {
// fixed length
final int length = entry.maxLength;
@@ -868,7 +809,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
// sparse
- final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength,
+ entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
return new BaseSortedDocValues(entry, data) {
@Override
@@ -1236,7 +1178,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
// sparse
- final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength,
+ entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
return new SortedNumericDocValues() {
boolean set;
@@ -1362,7 +1305,8 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
// sparse
- final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.numDocsWithField);
+ final IndexedDISI disi = new IndexedDISI(data, entry.docsWithFieldOffset, entry.docsWithFieldLength,
+ entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
return new BaseSortedSetDocValues(entry, data) {
boolean set;
@@ -1422,4 +1366,62 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
CodecUtil.checksumEntireFile(data);
}
+ /**
+ * Reader for longs split into blocks of different bits per value.
+ * The longs are requested by index and must be accessed in monotonically increasing order.
+ */
+ // Note: The order requirement could be removed as the jump-tables allow for backwards iteration
+ // Note 2: The rankSlice is only used if an advance of > 1 block is called. Its construction could be lazy
+ private class VaryingBPVReader {
+ final RandomAccessInput slice; // 2 slices to avoid cache thrashing when using rank
+ final RandomAccessInput rankSlice;
+ final NumericEntry entry;
+ final int shift;
+ final long mul;
+ final int mask;
+
+ long block = -1;
+ long delta;
+ long offset;
+ long blockEndOffset;
+ LongValues values;
+
+ VaryingBPVReader(NumericEntry entry, RandomAccessInput slice) throws IOException {
+ this.entry = entry;
+ this.slice = slice;
+ this.rankSlice = entry.valueJumpTableOffset == -1 ? null :
+ data.randomAccessSlice(entry.valueJumpTableOffset, data.length()-entry.valueJumpTableOffset);
+ shift = entry.blockShift;
+ mul = entry.gcd;
+ mask = (1 << shift) - 1;
+ }
+
+ long getLongValue(long index) throws IOException {
+ final long block = index >>> shift;
+ if (this.block != block) {
+ int bitsPerValue;
+ do {
+ // If the needed block is the one directly following the current block, it is cheaper to avoid the cache
+ if (rankSlice != null && block != this.block+1) {
+ blockEndOffset = rankSlice.readLong(block*Long.BYTES)-entry.valuesOffset;
+ this.block = block-1;
+ }
+ offset = blockEndOffset;
+ bitsPerValue = slice.readByte(offset++);
+ delta = slice.readLong(offset);
+ offset += Long.BYTES;
+ if (bitsPerValue == 0) {
+ blockEndOffset = offset;
+ } else {
+ final int length = slice.readInt(offset);
+ offset += Integer.BYTES;
+ blockEndOffset = offset + length;
+ }
+ this.block++;
+ } while (this.block != block);
+ values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
+ }
+ return mul * values.get(index & mask) + delta;
+ }
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsConsumer.java
similarity index 81%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsConsumer.java
index d79e246..ae2b445 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsConsumer.java
@@ -14,9 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
-
-import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT;
+package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
@@ -31,14 +29,16 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.IOUtils;
+import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_CURRENT;
+
/**
- * Writer for {@link Lucene70NormsFormat}
+ * Writer for {@link Lucene80NormsFormat}
*/
-final class Lucene70NormsConsumer extends NormsConsumer {
+final class Lucene80NormsConsumer extends NormsConsumer {
IndexOutput data, meta;
final int maxDoc;
- Lucene70NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ Lucene80NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
boolean success = false;
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
@@ -95,17 +95,23 @@ final class Lucene70NormsConsumer extends NormsConsumer {
meta.writeInt(field.number);
if (numDocsWithValue == 0) {
- meta.writeLong(-2);
- meta.writeLong(0L);
+ meta.writeLong(-2); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithValue == maxDoc) {
- meta.writeLong(-1);
- meta.writeLong(0L);
+ meta.writeLong(-1); // docsWithFieldOffset
+ meta.writeLong(0L); // docsWithFieldLength
+ meta.writeShort((short) -1); // jumpTableEntryCount
+ meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
- meta.writeLong(offset);
+ meta.writeLong(offset); // docsWithFieldOffset
values = normsProducer.getNorms(field);
- IndexedDISI.writeBitSet(values, data);
- meta.writeLong(data.getFilePointer() - offset);
+ final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
+ meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
+ meta.writeShort(jumpTableEntryCount);
+ meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}
meta.writeInt(numDocsWithValue);
@@ -115,7 +121,7 @@ final class Lucene70NormsConsumer extends NormsConsumer {
if (numBytesPerValue == 0) {
meta.writeLong(min);
} else {
- meta.writeLong(data.getFilePointer());
+ meta.writeLong(data.getFilePointer()); // normsOffset
values = normsProducer.getNorms(field);
writeValues(values, numBytesPerValue, data);
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsFormat.java
similarity index 88%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsFormat.java
index 7c764fe..915116b 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsFormat.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
+package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
@@ -27,7 +27,7 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
/**
- * Lucene 7.0 Score normalization format.
+ * Lucene 8.0 Score normalization format.
* <p>
* Encodes normalization values by encoding each value with the minimum
* number of bytes needed to represent the range (which can be zero).
@@ -68,31 +68,31 @@ import org.apache.lucene.store.DataOutput;
* <p>Norms data (.nvd) --> Header,< Data ><sup>NumFields</sup>,Footer</p>
* <ul>
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
- * <li>DocsWithFieldData --> Bit set of MaxDoc bits</li>
+ * <li>DocsWithFieldData --> {@link IndexedDISI#writeBitSet Bit set of MaxDoc bits}</li>
* <li>NormsData --> {@link DataOutput#writeByte(byte) byte}<sup>NumDocsWithField * BytesPerValue</sup></li>
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* </ol>
* @lucene.experimental
*/
-public class Lucene70NormsFormat extends NormsFormat {
+public class Lucene80NormsFormat extends NormsFormat {
/** Sole Constructor */
- public Lucene70NormsFormat() {}
+ public Lucene80NormsFormat() {}
@Override
public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
- return new Lucene70NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ return new Lucene80NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
}
@Override
public NormsProducer normsProducer(SegmentReadState state) throws IOException {
- return new Lucene70NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
+ return new Lucene80NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
}
- private static final String DATA_CODEC = "Lucene70NormsData";
+ private static final String DATA_CODEC = "Lucene80NormsData";
private static final String DATA_EXTENSION = "nvd";
- private static final String METADATA_CODEC = "Lucene70NormsMetadata";
+ private static final String METADATA_CODEC = "Lucene80NormsMetadata";
private static final String METADATA_EXTENSION = "nvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsProducer.java
similarity index 87%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
rename to lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsProducer.java
index c7310e8..66126a2 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene70/Lucene70NormsProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/Lucene80NormsProducer.java
@@ -14,10 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
-
-import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_CURRENT;
-import static org.apache.lucene.codecs.lucene70.Lucene70NormsFormat.VERSION_START;
+package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
import java.util.HashMap;
@@ -37,19 +34,23 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.IOUtils;
+import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_CURRENT;
+import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_START;
+
/**
- * Reader for {@link Lucene70NormsFormat}
+ * Reader for {@link Lucene80NormsFormat}
*/
-final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
+final class Lucene80NormsProducer extends NormsProducer implements Cloneable {
// metadata maps (just file pointers and minimal stuff)
private final Map<Integer,NormsEntry> norms = new HashMap<>();
private final int maxDoc;
private IndexInput data;
private boolean merging;
private Map<Integer, IndexInput> disiInputs;
+ private Map<Integer, RandomAccessInput> disiJumpTables;
private Map<Integer, RandomAccessInput> dataInputs;
- Lucene70NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+ Lucene80NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
maxDoc = state.segmentInfo.maxDoc();
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
int version = -1;
@@ -92,24 +93,27 @@ final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
@Override
public NormsProducer getMergeInstance() throws IOException {
- Lucene70NormsProducer clone;
+ Lucene80NormsProducer clone;
try {
- clone = (Lucene70NormsProducer) super.clone();
+ clone = (Lucene80NormsProducer) super.clone();
} catch (CloneNotSupportedException e) {
// cannot happen
throw new RuntimeException(e);
}
clone.data = data.clone();
- clone.dataInputs = new HashMap<>();
clone.disiInputs = new HashMap<>();
+ clone.disiJumpTables = new HashMap<>();
+ clone.dataInputs = new HashMap<>();
clone.merging = true;
return clone;
}
static class NormsEntry {
+ byte denseRankPower;
byte bytesPerNorm;
long docsWithFieldOffset;
long docsWithFieldLength;
+ short jumpTableEntryCount;
int numDocsWithField;
long normsOffset;
}
@@ -199,6 +203,8 @@ final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
NormsEntry entry = new NormsEntry();
entry.docsWithFieldOffset = meta.readLong();
entry.docsWithFieldLength = meta.readLong();
+ entry.jumpTableEntryCount = meta.readShort();
+ entry.denseRankPower = meta.readByte();
entry.numDocsWithField = meta.readInt();
entry.bytesPerNorm = meta.readByte();
switch (entry.bytesPerNorm) {
@@ -232,7 +238,8 @@ final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
slice = disiInputs.get(field.number);
}
if (slice == null) {
- slice = data.slice("docs", entry.docsWithFieldOffset, entry.docsWithFieldLength);
+ slice = IndexedDISI.createBlockSlice(
+ data, "docs", entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.jumpTableEntryCount);
if (merging) {
disiInputs.put(field.number, slice);
}
@@ -240,6 +247,22 @@ final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
return slice;
}
+ private RandomAccessInput getDisiJumpTable(FieldInfo field, NormsEntry entry) throws IOException {
+ RandomAccessInput jumpTable = null;
+ if (merging) {
+ jumpTable = disiJumpTables.get(field.number);
+ }
+ if (jumpTable == null) {
+ jumpTable = IndexedDISI.createJumpTable(
+ data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.jumpTableEntryCount);
+ if (merging) {
+ disiJumpTables.put(field.number, jumpTable);
+ }
+ }
+ return jumpTable;
+ }
+
+
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
final NormsEntry entry = norms.get(field.number);
@@ -293,7 +316,9 @@ final class Lucene70NormsProducer extends NormsProducer implements Cloneable {
} else {
// sparse
final IndexInput disiInput = getDisiInput(field, entry);
- final IndexedDISI disi = new IndexedDISI(disiInput, entry.numDocsWithField);
+ final RandomAccessInput disiJumpTable = getDisiJumpTable(field, entry);
+ final IndexedDISI disi = new IndexedDISI(disiInput, disiJumpTable, entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
+
if (entry.bytesPerNorm == 0) {
return new SparseNormsIterator(disi) {
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
index 2b8a7e1..82b5689 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
@@ -163,7 +163,7 @@
* all documents omit position data.
* </li>
* <li>
- * {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}.
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
* </li>
@@ -175,7 +175,7 @@
* {@link org.apache.lucene.document.Field Field} constructors
* </li>
* <li>
- * {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-document values}.
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
* Like stored values, these are also keyed by document
* number, but are generally intended to be loaded into main memory for fast
* access. Whereas stored values are generally intended for summary results from
@@ -284,12 +284,12 @@
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
- * <td>{@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
- * <td>{@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}</td>
+ * <td>{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>
@@ -393,7 +393,9 @@
* doc ids, the (term freq, normalization factor) pairs that may trigger the
* maximum score of the block. This information is recorded alongside skip data
* in order to be able to skip blocks of doc ids if they may not produce high
- * enough scores.</li>
+ * enough scores.
+ * Additionally, doc values and norms have been extended with jump-tables to make access O(1)
+ * instead of O(n), where n is the number of elements to skip when advancing in the data.</li>
* </ul>
* <a name="Limitations"></a>
* <h2>Limitations</h2>
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index 20463c5..43ed529 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
+org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestIndexedDISI.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestIndexedDISI.java
new file mode 100644
index 0000000..6f102b3
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestIndexedDISI.java
@@ -0,0 +1,522 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene80;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+// Copied from the lucene70 package for separation of codec-code
+public class TestIndexedDISI extends LuceneTestCase {
+
+ public void testEmpty() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+ // EMPTY blocks are special with regard to jumps as they have size 0
+ public void testEmptyBlocks() throws IOException {
+ final int B = 65536;
+ int maxDoc = B*11;
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ // block 0: EMPTY
+ set.set(B+5); // block 1: SPARSE
+ // block 2: EMPTY
+ // block 3: EMPTY
+ set.set(B*4+5); // block 4: SPARSE
+
+ for (int i = 0 ; i < B ; i++) {
+ set.set(B*6+i); // block 6: ALL
+ }
+ for (int i = 0 ; i < B ; i+=3) {
+ set.set(B*7+i); // block 7: DENSE
+ }
+ for (int i = 0 ; i < B ; i++) {
+ if (i != 32768) {
+ set.set(B*8 + i); // block 8: DENSE (all-1)
+ }
+ }
+ // block 9-10: EMPTY
+
+ try (Directory dir = newDirectory()) {
+ doTestAllSingleJump(set, dir);
+ }
+
+ // Change the first block from EMPTY to SPARSE (a single set bit) to see if the jump-tables handle position 0
+ set.set(0);
+ try (Directory dir = newDirectory()) {
+ doTestAllSingleJump(set, dir);
+ }
+ }
+
+ // EMPTY blocks are special with regard to jumps as they have size 0
+ public void testLastEmptyBlocks() throws IOException {
+ final int B = 65536;
+ int maxDoc = B*3;
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ for (int docID = 0 ; docID < B*2 ; docID++) { // first 2 blocks are ALL
+ set.set(docID);
+ }
+ // Last block is EMPTY
+
+ try (Directory dir = newDirectory()) {
+ doTestAllSingleJump(set, dir);
+ assertAdvanceBeyondEnd(set, dir);
+ }
+ }
+
+ // Checks that advance after the end of the blocks has been reached has the correct behaviour
+ private void assertAdvanceBeyondEnd(FixedBitSet set, Directory dir) throws IOException {
+ final int cardinality = set.cardinality();
+ final byte denseRankPower = 9; // Not tested here so fixed to isolate factors
+ long length;
+ int jumpTableentryCount;
+ try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
+ jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
+ }
+
+ try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
+ BitSetIterator disi2 = new BitSetIterator(set, cardinality);
+ int doc = disi2.docID();
+ int index = 0;
+ while (doc < cardinality) {
+ doc = disi2.nextDoc();
+ index++;
+ }
+
+ IndexedDISI disi = new IndexedDISI(in, 0L, in.length(), jumpTableentryCount, denseRankPower, cardinality);
+ // Advance 1 docID beyond end
+ assertFalse("There should be no set bit beyond the valid docID range", disi.advanceExact(set.length()));
+ disi.advance(doc); // Should be the special docID signifying NO_MORE_DOCS from the BitSetIterator
+ assertEquals("The index when advancing beyond the last defined docID should be correct",
+ index, disi.index()+1); // disi.index()+1 as the while-loop also counts the NO_MORE_DOCS
+ }
+ }
+
+ public void testRandomBlocks() throws IOException {
+ final int BLOCKS = 5;
+ FixedBitSet set = createSetWithRandomBlocks(BLOCKS);
+ try (Directory dir = newDirectory()) {
+ doTestAllSingleJump(set, dir);
+ }
+ }
+
+ // When doing merges in Lucene80NormsProducer, IndexedDISI are created from slices where the offset is not 0
+ public void testPositionNotZero() throws IOException {
+ final int BLOCKS = 10;
+ final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
+
+ FixedBitSet set = createSetWithRandomBlocks(BLOCKS);
+ try (Directory dir = newDirectory()) {
+ final int cardinality = set.cardinality();
+ int jumpTableEntryCount;
+ try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+ jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
+ }
+ try (IndexInput fullInput = dir.openInput("foo", IOContext.DEFAULT)) {
+ IndexInput blockData =
+ IndexedDISI.createBlockSlice(fullInput, "blocks", 0, fullInput.length(), jumpTableEntryCount);
+ blockData.seek(random().nextInt((int) blockData.length()));
+
+ RandomAccessInput jumpTable = IndexedDISI.createJumpTable(fullInput, 0, fullInput.length(), jumpTableEntryCount);
+ IndexedDISI disi = new IndexedDISI(blockData, jumpTable, jumpTableEntryCount, denseRankPower, cardinality);
+ // This failed at some point during LUCENE-8585 development as it did not reset the slice position
+ disi.advanceExact(BLOCKS*65536-1);
+ }
+ }
+ }
+
+ private FixedBitSet createSetWithRandomBlocks(int blockCount) {
+ final int B = 65536;
+ FixedBitSet set = new FixedBitSet(blockCount * B);
+ for (int block = 0; block < blockCount; block++) {
+ switch (random().nextInt(4)) {
+ case 0: { // EMPTY
+ break;
+ }
+ case 1: { // ALL
+ for (int docID = block* B; docID < (block+1)* B; docID++) {
+ set.set(docID);
+ }
+ break;
+ }
+ case 2: { // SPARSE ( < 4096 )
+ for (int docID = block* B; docID < (block+1)* B; docID += 101) {
+ set.set(docID);
+ }
+ break;
+ }
+ case 3: { // DENSE ( >= 4096 )
+ for (int docID = block* B; docID < (block+1)* B; docID += 3) {
+ set.set(docID);
+ }
+ break;
+ }
+ default: throw new IllegalStateException("Modulo logic error: there should only be 4 possibilities");
+ }
+ }
+ return set;
+ }
+
+ // For every docID, checks advanceExact(i) and advance(i) on freshly created iterators (one jump from the start each)
+ private void doTestAllSingleJump(FixedBitSet set, Directory dir) throws IOException {
+ final int cardinality = set.cardinality();
+ final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
+ long length;
+ int jumpTableentryCount;
+ try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
+ jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
+ length = out.getFilePointer();
+ }
+
+ try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
+ for (int i = 0; i < set.length(); i++) {
+ IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
+ assertEquals("The bit at " + i + " should be correct with advanceExact", set.get(i), disi.advanceExact(i));
+
+ IndexedDISI disi2 = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
+ disi2.advance(i);
+ // Proper sanity check with jump tables as an error could make them seek backwards
+ assertTrue("The docID should at least be " + i + " after advance(" + i + ") but was " + disi2.docID(),
+ i <= disi2.docID());
+ if (set.get(i)) {
+ assertEquals("The docID should be present with advance", i, disi2.docID());
+ } else {
+ assertTrue("The docID should not be present with advance", i != disi2.docID()); // value check; assertNotSame only compares boxed references
+ }
+ }
+ }
+ }
+
+ public void testOneDoc() throws IOException {
+ int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+ FixedBitSet set = new FixedBitSet(maxDoc);
+ set.set(random().nextInt(maxDoc));
+ try (Directory dir = newDirectory()) {
+ doTest(set, dir);
+ }
+ }
+
+  /**
+   * Checks a bitset of random size (1 to 100000) with two randomly picked documents set.
+   * Both picks may land on the same document, in which case only one bit ends up set.
+   */
+  public void testTwoDocs() throws IOException {
+    final int docCount = TestUtil.nextInt(random(), 1, 100000);
+    final FixedBitSet bits = new FixedBitSet(docCount);
+    final int firstDoc = random().nextInt(docCount);
+    bits.set(firstDoc);
+    final int secondDoc = random().nextInt(docCount);
+    bits.set(secondDoc);
+    try (Directory directory = newDirectory()) {
+      doTest(bits, directory);
+    }
+  }
+
+  /**
+   * Checks a bitset of random size (1 to 100000) in which every document is set.
+   */
+  public void testAllDocs() throws IOException {
+    int maxDoc = TestUtil.nextInt(random(), 1, 100000);
+    FixedBitSet set = new FixedBitSet(maxDoc);
+    // Start at 0: starting at 1 left document 0 unset, so the set never held all documents, and for
+    // maxDoc == 1 it passed a start index equal to the size of the bitset
+    set.set(0, maxDoc);
+    try (Directory dir = newDirectory()) {
+      doTest(set, dir);
+    }
+  }
+
+  /**
+   * Checks a bitset of random size where set documents are 1 to 3 positions apart,
+   * starting at document 0 or 1.
+   */
+  public void testHalfFull() throws IOException {
+    final int docCount = TestUtil.nextInt(random(), 1, 100000);
+    final FixedBitSet bits = new FixedBitSet(docCount);
+    int doc = random().nextInt(2);
+    while (doc < docCount) {
+      bits.set(doc);
+      doc += TestUtil.nextInt(random(), 1, 3);
+    }
+    try (Directory directory = newDirectory()) {
+      doTest(bits, directory);
+    }
+  }
+
+  /**
+   * Runs ten rounds, each checking a bitset of random size (1 to 1000000) in which one random
+   * contiguous range of documents is set.
+   */
+  public void testDocRange() throws IOException {
+    try (Directory directory = newDirectory()) {
+      int roundsLeft = 10;
+      while (roundsLeft > 0) {
+        final int docCount = TestUtil.nextInt(random(), 1, 1000000);
+        final FixedBitSet bits = new FixedBitSet(docCount);
+        final int from = random().nextInt(docCount);
+        final int to = TestUtil.nextInt(random(), from + 1, docCount);
+        bits.set(from, to);
+        doTest(bits, directory);
+        roundsLeft--;
+      }
+    }
+  }
+
+  /**
+   * Checks the switch between the SPARSE and DENSE encodings: a block holding exactly
+   * {@code IndexedDISI.MAX_ARRAY_LENGTH} documents is expected to be SPARSE, and one additional
+   * document is expected to make it DENSE.
+   */
+  public void testSparseDenseBoundary() throws IOException {
+    try (Directory dir = newDirectory()) {
+      FixedBitSet set = new FixedBitSet(200000);
+      int start = 65536 + random().nextInt(100);
+      final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
+
+      // we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
+      set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
+      long length;
+      int jumpTableEntryCount;
+      try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
+        jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out, denseRankPower);
+        length = out.getFilePointer();
+      }
+      try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
+        IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPower, IndexedDISI.MAX_ARRAY_LENGTH);
+        assertEquals(start, disi.nextDoc());
+        assertEquals(IndexedDISI.Method.SPARSE, disi.method);
+      }
+      doTest(set, dir);
+
+      // now we set one more bit so the encoding will be dense
+      set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
+      try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
+        // Take the entry count from this write instead of reusing the value that belongs to the "sparse" stream
+        jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out, denseRankPower);
+        length = out.getFilePointer();
+      }
+      try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
+        IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPower, IndexedDISI.MAX_ARRAY_LENGTH + 1);
+        assertEquals(start, disi.nextDoc());
+        assertEquals(IndexedDISI.Method.DENSE, disi.method);
+      }
+      doTest(set, dir);
+    }
+  }
+
+  /** Checks a bitset of random size (1 to 1000000) in which all documents but one random one are set. */
+  public void testOneDocMissing() throws IOException {
+    final int docCount = TestUtil.nextInt(random(), 1, 1000000);
+    final FixedBitSet bits = new FixedBitSet(docCount);
+    bits.set(0, docCount);
+    final int missingDoc = random().nextInt(docCount);
+    bits.clear(missingDoc);
+    try (Directory directory = newDirectory()) {
+      doTest(bits, directory);
+    }
+  }
+
+  /**
+   * Runs 100 rounds, each checking a fully set bitset of random size (1 to 100000) from which
+   * 2 to 1000 randomly picked documents (repeats possible) have been cleared.
+   */
+  public void testFewMissingDocs() throws IOException {
+    try (Directory directory = newDirectory()) {
+      for (int round = 0; round < 100; round++) {
+        final int docCount = TestUtil.nextInt(random(), 1, 100000);
+        final FixedBitSet bits = new FixedBitSet(docCount);
+        bits.set(0, docCount);
+        int clearsLeft = TestUtil.nextInt(random(), 2, 1000);
+        while (clearsLeft > 0) {
+          bits.clear(random().nextInt(docCount));
+          clearsLeft--;
+        }
+        doTest(bits, directory);
+      }
+    }
+  }
+
+  /** Checks a bitset spanning 10 blocks of 65536 documents in which every even document is set. */
+  public void testDenseMultiBlock() throws IOException {
+    try (Directory directory = newDirectory()) {
+      final int docCount = 10 * 65536; // 10 blocks
+      final FixedBitSet bits = new FixedBitSet(docCount);
+      // Set every other to ensure dense
+      int evenDoc = 0;
+      while (evenDoc < docCount) {
+        bits.set(evenDoc);
+        evenDoc += 2;
+      }
+      doTest(bits, directory);
+    }
+  }
+
+  /**
+   * Verifies the accepted range for denseRankPower: -1 and 7 to 15 must be usable for both writing and
+   * reading, while other values must be rejected with an {@link IllegalArgumentException} by the write
+   * as well as by the read side.
+   */
+  public void testIllegalDenseRankPower() throws IOException {
+
+    // Legal values
+    for (byte denseRankPower: new byte[]{-1, 7, 8, 9, 10, 11, 12, 13, 14, 15}) {
+      createAndOpenDISI(denseRankPower, denseRankPower);
+    }
+
+    // Illegal values
+    for (byte denseRankPower: new byte[]{-2, 0, 1, 6, 16}) {
+      try {
+        createAndOpenDISI(denseRankPower, (byte) 8); // Illegal write, legal read (should not reach read)
+        // The message names the roles as they are passed above: the illegal value is the write power
+        fail("Trying to create an IndexedDISI data stream with denseRankPower-write " + denseRankPower +
+            " and denseRankPower-read 8 should fail");
+      } catch (IllegalArgumentException e) {
+        // Expected
+      }
+      try {
+        createAndOpenDISI((byte) 8, denseRankPower); // Legal write, illegal read (should reach read)
+        fail("Trying to create an IndexedDISI data stream with denseRankPower-write 8 and denseRankPower-read " +
+            denseRankPower + " should fail");
+      } catch (IllegalArgumentException e) {
+        // Expected
+      }
+    }
+  }
+
+  /**
+   * Writes a 10-bit bitset with only the last bit set using {@code denseRankPowerWrite}, then constructs an
+   * IndexedDISI over the written data using {@code denseRankPowerRead}.
+   *
+   * @param denseRankPowerWrite the denseRankPower handed to {@code IndexedDISI.writeBitSet}
+   * @param denseRankPowerRead the denseRankPower handed to the IndexedDISI constructor
+   */
+  private void createAndOpenDISI(byte denseRankPowerWrite, byte denseRankPowerRead) throws IOException {
+    FixedBitSet bits = new FixedBitSet(10);
+    bits.set(bits.length() - 1);
+    try (Directory directory = newDirectory()) {
+      long dataLength;
+      int jumpTableEntryCount;
+      try (IndexOutput output = directory.createOutput("foo", IOContext.DEFAULT)) {
+        BitSetIterator source = new BitSetIterator(bits, bits.cardinality());
+        jumpTableEntryCount = IndexedDISI.writeBitSet(source, output, denseRankPowerWrite);
+        dataLength = output.getFilePointer();
+      }
+      try (IndexInput input = directory.openInput("foo", IOContext.DEFAULT)) {
+        // This tests the legality of the denseRankPower only, so the instance is created and discarded
+        new IndexedDISI(input, 0L, dataLength, jumpTableEntryCount, denseRankPowerRead, bits.cardinality());
+      }
+    }
+  }
+
+  /**
+   * Fixed-size variant of the one-missing-document check: 9699 documents with document 1345 cleared,
+   * compared against a BitSetIterator using advance with a step of 16000.
+   */
+  public void testOneDocMissingFixed() throws IOException {
+    final int docCount = 9699;
+    final byte rankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
+    final FixedBitSet bits = new FixedBitSet(docCount);
+    bits.set(0, docCount);
+    bits.clear(1345);
+    try (Directory directory = newDirectory()) {
+      final int setBits = bits.cardinality();
+      long dataLength;
+      int jumpEntries;
+      try (IndexOutput output = directory.createOutput("foo", IOContext.DEFAULT)) {
+        jumpEntries = IndexedDISI.writeBitSet(new BitSetIterator(bits, setBits), output, rankPower);
+        dataLength = output.getFilePointer();
+      }
+
+      try (IndexInput input = directory.openInput("foo", IOContext.DEFAULT)) {
+        IndexedDISI actual = new IndexedDISI(input, 0L, dataLength, jumpEntries, rankPower, setBits);
+        BitSetIterator expected = new BitSetIterator(bits, setBits);
+        assertAdvanceEquality(actual, expected, 16000);
+      }
+    }
+  }
+
+ public void testRandom() throws IOException {
+ try (Directory dir = newDirectory()) {
+ for (int i = 0; i < 10; ++i) {
+ doTestRandom(dir);
+ }
+ }
+ }
+
+  /**
+   * Builds a bitset from a strictly increasing sequence of documents with random gaps and checks it.
+   * The largest possible gap is itself random, between 1 and at most 2^20.
+   *
+   * @param dir the directory handed on to {@code doTest}
+   */
+  private void doTestRandom(Directory dir) throws IOException {
+    final int gapShift = TestUtil.nextInt(random(), 2, 20);
+    final int largestGap = TestUtil.nextInt(random(), 1, 1 << gapShift);
+    // The upper bound keeps the sum of all gaps within the int range
+    final int docCount = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / largestGap));
+
+    final int[] pickedDocs = new int[docCount];
+    int current = -1;
+    for (int slot = 0; slot < docCount; slot++) {
+      current += TestUtil.nextInt(random(), 1, largestGap);
+      pickedDocs[slot] = current;
+    }
+    // Add 1 to 100 unset documents after the last picked one
+    final int bitCount = pickedDocs[docCount - 1] + TestUtil.nextInt(random(), 1, 100);
+
+    final FixedBitSet bits = new FixedBitSet(bitCount);
+    for (int picked : pickedDocs) {
+      bits.set(picked);
+    }
+
+    doTest(bits, dir);
+  }
+
+  /**
+   * Writes {@code set} as an IndexedDISI structure to the file "foo" in {@code dir} and compares it to a
+   * BitSetIterator over the same bitset in three ways: single stepping with nextDoc, advance with fixed
+   * steps, and advanceExact with random steps. The file is deleted once all comparisons have passed.
+   *
+   * @param set the bitset holding the expected documents
+   * @param dir the directory used for the temporary "foo" file
+   */
+  private void doTest(FixedBitSet set, Directory dir) throws IOException {
+    final int setBits = set.cardinality();
+    final byte rankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
+    long dataLength;
+    int jumpEntries;
+    try (IndexOutput output = dir.createOutput("foo", IOContext.DEFAULT)) {
+      jumpEntries = IndexedDISI.writeBitSet(new BitSetIterator(set, setBits), output, rankPower);
+      dataLength = output.getFilePointer();
+    }
+
+    // Single stepping through all documents
+    try (IndexInput input = dir.openInput("foo", IOContext.DEFAULT)) {
+      IndexedDISI actual = new IndexedDISI(input, 0L, dataLength, jumpEntries, rankPower, setBits);
+      BitSetIterator expected = new BitSetIterator(set, setBits);
+      assertSingleStepEquality(actual, expected);
+    }
+
+    // advance with fixed step sizes, using fresh iterators for each step size
+    final int[] advanceSteps = {1, 10, 100, 1000, 10000, 100000};
+    for (int advanceStep : advanceSteps) {
+      try (IndexInput input = dir.openInput("foo", IOContext.DEFAULT)) {
+        IndexedDISI actual = new IndexedDISI(input, 0L, dataLength, jumpEntries, rankPower, setBits);
+        BitSetIterator expected = new BitSetIterator(set, setBits);
+        assertAdvanceEquality(actual, expected, advanceStep);
+      }
+    }
+
+    // advanceExact with random increments bounded by the step size
+    final int[] exactSteps = {10, 100, 1000, 10000, 100000};
+    for (int exactStep : exactSteps) {
+      try (IndexInput input = dir.openInput("foo", IOContext.DEFAULT)) {
+        IndexedDISI actual = new IndexedDISI(input, 0L, dataLength, jumpEntries, rankPower, setBits);
+        BitSetIterator expected = new BitSetIterator(set, setBits);
+        assertAdvanceExactRandomized(actual, expected, set.length(), exactStep);
+      }
+    }
+
+    dir.deleteFile("foo");
+  }
+
+ /**
+ * Calls {@code advanceExact} on {@code disi} with randomly increasing targets and checks each outcome
+ * against {@code disi2}, which is stepped forward with {@code nextDoc} alongside.
+ *
+ * @param disi the IndexedDISI under test
+ * @param disi2 the reference iterator
+ * @param disi2length the loop runs while the target is below this value (the caller in this file passes the bitset length)
+ * @param step upper bound (inclusive) for the random increment of the target; an increment of 0 is possible
+ */
+ private void assertAdvanceExactRandomized(IndexedDISI disi, BitSetIterator disi2, int disi2length, int step)
+ throws IOException {
+ // Counts the nextDoc calls on disi2, offset by -1, and is compared to disi.index() below
+ int index = -1;
+ for (int target = 0; target < disi2length; ) {
+ target += TestUtil.nextInt(random(), 0, step);
+ int doc = disi2.docID();
+ // Move the reference iterator to the first document at or after the target
+ while (doc < target) {
+ doc = disi2.nextDoc();
+ index++;
+ }
+
+ boolean exists = disi.advanceExact(target);
+ // The target exists exactly when the reference iterator stopped on it
+ assertEquals(doc == target, exists);
+ if (exists) {
+ assertEquals(index, disi.index());
+ } else if (random().nextBoolean()) {
+ // Randomly also check that nextDoc after a failed advanceExact reaches the reference document
+ assertEquals(doc, disi.nextDoc());
+ // This is a bit strange when doc == NO_MORE_DOCS as the index overcounts in the disi2 while-loop
+ assertEquals(index, disi.index());
+ // Continue from the document both iterators are now positioned on
+ target = doc;
+ }
+ }
+ }
+
+  /**
+   * Steps both iterators with nextDoc until {@code disi2} is exhausted and checks that {@code disi} returns
+   * the same documents, that its index counts up from 0, and that it is exhausted at the same time.
+   *
+   * @param disi the IndexedDISI under test
+   * @param disi2 the reference iterator
+   */
+  private void assertSingleStepEquality(IndexedDISI disi, BitSetIterator disi2) throws IOException {
+    int expectedIndex = 0;
+    int expectedDoc = disi2.nextDoc();
+    while (expectedDoc != DocIdSetIterator.NO_MORE_DOCS) {
+      assertEquals(expectedDoc, disi.nextDoc());
+      assertEquals(expectedIndex, disi.index());
+      expectedIndex++;
+      expectedDoc = disi2.nextDoc();
+    }
+    assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
+  }
+
+ private void assertAdvanceEquality(IndexedDISI disi, BitSetIterator disi2, int step) throws IOException {
+ int index = -1;
+ while (true) {
+ int target = disi2.docID() + step;
+ int doc;
+ do {
+ doc = disi2.nextDoc();
+ index++;
+ } while (doc < target);
+ assertEquals(doc, disi.advance(target));
+ if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+ break;
+ }
+ assertEquals("Expected equality using step " + step + " at docID " + doc, index, disi.index());
+ }
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80DocValuesFormat.java
similarity index 81%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
rename to lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80DocValuesFormat.java
index 9ed3173..03dfd69 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80DocValuesFormat.java
@@ -14,8 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
-
+package org.apache.lucene.codecs.lucene80;
import java.io.IOException;
import java.util.ArrayList;
@@ -63,18 +62,20 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.ByteBuffersDataInput;
-import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMFile;
+import org.apache.lucene.store.RAMInputStream;
+import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.TestUtil;
/**
- * Tests Lucene70DocValuesFormat
+ * Tests Lucene80DocValuesFormat
+ * Copied directly from the lucene70 package for separation of codec-code
*/
-public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
- private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene70DocValuesFormat());
+public class TestLucene80DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
+ private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene80DocValuesFormat());
@Override
protected Codec getCodec() {
@@ -286,7 +287,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
conf.setMergeScheduler(new SerialMergeScheduler());
// set to duel against a codec which has ordinals:
final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
- final DocValuesFormat dv = new Lucene70DocValuesFormat();
+ final DocValuesFormat dv = new Lucene80DocValuesFormat();
conf.setCodec(new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
@@ -441,11 +442,12 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
@Slow
public void testSortedSetAroundBlockSize() throws IOException {
- final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+ final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
final Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
- ByteBuffersDataOutput buffer = new ByteBuffersDataOutput();
+ RAMFile buffer = new RAMFile();
+ RAMOutputStream out = new RAMOutputStream(buffer, false);
Document doc = new Document();
SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
doc.add(field1);
@@ -458,12 +460,13 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
field2.setBytesValue(s2);
w.addDocument(doc);
Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
- buffer.writeVInt(set.size());
+ out.writeVInt(set.size());
for (BytesRef ref : set) {
- buffer.writeVInt(ref.length);
- buffer.writeBytes(ref.bytes, ref.offset, ref.length);
+ out.writeVInt(ref.length);
+ out.writeBytes(ref.bytes, ref.offset, ref.length);
}
}
+ out.close();
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
w.close();
@@ -471,20 +474,21 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
assertEquals(maxDoc, sr.maxDoc());
SortedSetDocValues values = sr.getSortedSetDocValues("sset");
assertNotNull(values);
- ByteBuffersDataInput in = buffer.toDataInput();
- BytesRefBuilder b = new BytesRefBuilder();
- for (int i = 0; i < maxDoc; ++i) {
- assertEquals(i, values.nextDoc());
- final int numValues = in.readVInt();
-
- for (int j = 0; j < numValues; ++j) {
- b.setLength(in.readVInt());
- b.grow(b.length());
- in.readBytes(b.bytes(), 0, b.length());
- assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
- }
+ try (RAMInputStream in = new RAMInputStream("", buffer)) {
+ BytesRefBuilder b = new BytesRefBuilder();
+ for (int i = 0; i < maxDoc; ++i) {
+ assertEquals(i, values.nextDoc());
+ final int numValues = in.readVInt();
+
+ for (int j = 0; j < numValues; ++j) {
+ b.setLength(in.readVInt());
+ b.grow(b.length());
+ in.readBytes(b.bytes(), 0, b.length());
+ assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
+ }
- assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
+ assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
+ }
}
r.close();
dir.close();
@@ -493,12 +497,12 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
@Slow
public void testSortedNumericAroundBlockSize() throws IOException {
- final int frontier = 1 << Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+ final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
final Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
- ByteBuffersDataOutput buffer = new ByteBuffersDataOutput();
-
+ RAMFile buffer = new RAMFile();
+ RAMOutputStream out = new RAMOutputStream(buffer, false);
Document doc = new Document();
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("snum", 0L);
doc.add(field1);
@@ -510,10 +514,10 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
field1.setLongValue(s1);
field2.setLongValue(s2);
w.addDocument(doc);
- buffer.writeVLong(Math.min(s1, s2));
- buffer.writeVLong(Math.max(s1, s2));
+ out.writeVLong(Math.min(s1, s2));
+ out.writeVLong(Math.max(s1, s2));
}
-
+ out.close();
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
w.close();
@@ -521,12 +525,13 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
assertEquals(maxDoc, sr.maxDoc());
SortedNumericDocValues values = sr.getSortedNumericDocValues("snum");
assertNotNull(values);
- ByteBuffersDataInput dataInput = buffer.toDataInput();
- for (int i = 0; i < maxDoc; ++i) {
- assertEquals(i, values.nextDoc());
- assertEquals(2, values.docValueCount());
- assertEquals(dataInput.readVLong(), values.nextValue());
- assertEquals(dataInput.readVLong(), values.nextValue());
+ try (RAMInputStream in = new RAMInputStream("", buffer)) {
+ for (int i = 0; i < maxDoc; ++i) {
+ assertEquals(i, values.nextDoc());
+ assertEquals(2, values.docValueCount());
+ assertEquals(in.readVLong(), values.nextValue());
+ assertEquals(in.readVLong(), values.nextValue());
+ }
}
r.close();
dir.close();
@@ -553,15 +558,62 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
doTestSparseNumericBlocksOfVariousBitsPerValue(random().nextDouble());
}
+ // The LUCENE-8585 jump-tables enable O(1) skipping of IndexedDISI blocks, DENSE block lookup
+ // and numeric multi blocks. This test indexes a single large segment and checks these jumps.
+ @Slow
+ public void testNumericFieldJumpTables() throws Exception {
+ // IndexedDISI block skipping is only activated if target >= current+2, so we need at least 5 blocks to
+ // trigger consecutive block skips
+ final int maxDoc = atLeast(5*65536);
+
+ Directory dir = newDirectory();
+ IndexWriter iw = createFastIndexWriter(dir, maxDoc);
+
+ Field idField = newStringField("id", "", Field.Store.NO);
+ Field storedField = newStringField("stored", "", Field.Store.YES);
+ Field dvField = new NumericDocValuesField("dv", 0);
+
+ for (int i = 0 ; i < maxDoc ; i++) {
+ Document doc = new Document();
+ idField.setStringValue(Integer.toBinaryString(i));
+ doc.add(idField);
+ if (random().nextInt(100) > 10) { // Skip ~11% (outcomes 0-10 of 100) to make DENSE blocks
+ int value = random().nextInt(100000);
+ storedField.setStringValue(Integer.toString(value));
+ doc.add(storedField);
+ dvField.setLongValue(value);
+ doc.add(dvField);
+ }
+ iw.addDocument(doc);
+ }
+ iw.flush();
+ iw.forceMerge(1, true); // Single segment to force large enough structures
+ iw.commit();
+ iw.close();
+
+ assertDVIterate(dir);
+ assertDVAdvance(dir, rarely() ? 1 : 7); // jumpStep 1 is heavy (~20 s), so we do it rarely. 7 is a lot faster (8 s)
+
+ dir.close();
+ }
+
+ private IndexWriter createFastIndexWriter(Directory dir, int maxBufferedDocs) throws IOException {
+ IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+ conf.setMaxBufferedDocs(maxBufferedDocs);
+ conf.setRAMBufferSizeMB(-1);
+ conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
+ return new IndexWriter(dir, conf);
+ }
+
private static LongSupplier blocksOfVariousBPV() {
final long mul = TestUtil.nextInt(random(), 1, 100);
final long min = random().nextInt();
return new LongSupplier() {
- int i = Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE;
+ int i = Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
int maxDelta;
@Override
public long getAsLong() {
- if (i == Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE) {
+ if (i == Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE) {
maxDelta = 1 << random().nextInt(5);
i = 0;
}
@@ -574,12 +626,12 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
- conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE));
+ conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
IndexWriter writer = new IndexWriter(dir, conf);
- final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
+ final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
final LongSupplier values = blocksOfVariousBPV();
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
@@ -633,7 +685,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
- conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE));
+ conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
IndexWriter writer = new IndexWriter(dir, conf);
@@ -643,7 +695,7 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
doc.add(storedField);
doc.add(dvField);
- final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
+ final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
final LongSupplier longs = blocksOfVariousBPV();
for (int i = 0; i < numDocs; i++) {
if (random().nextDouble() > density) {
@@ -661,25 +713,39 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
writer.close();
// compare
+ assertDVIterate(dir);
+ assertDVAdvance(dir, 1); // Tests all jump-lengths from 1 to maxDoc (quite slow ~= 1 minute for 200K docs)
+
+ dir.close();
+ }
+
+ // Tests that advanceExact does not change the outcome
+ private void assertDVAdvance(Directory dir, int jumpStep) throws IOException {
DirectoryReader ir = DirectoryReader.open(dir);
TestUtil.checkReader(ir);
for (LeafReaderContext context : ir.leaves()) {
LeafReader r = context.reader();
- NumericDocValues docValues = DocValues.getNumeric(r, "dv");
- docValues.nextDoc();
- for (int i = 0; i < r.maxDoc(); i++) {
- String storedValue = r.document(i).get("stored");
- if (storedValue == null) {
- assertTrue(docValues.docID() > i);
- } else {
- assertEquals(i, docValues.docID());
- assertEquals(Long.parseLong(storedValue), docValues.longValue());
- docValues.nextDoc();
+
+
+ for (int jump = jumpStep; jump < r.maxDoc(); jump += jumpStep) {
+ // Create a new instance each time to ensure jumps from the beginning
+ NumericDocValues docValues = DocValues.getNumeric(r, "dv");
+ for (int docID = 0; docID < r.maxDoc(); docID += jump) {
+ String base = "document #" + docID + "/" + r.maxDoc() + ", jumping " + jump + " from #" + (docID-jump);
+ String storedValue = r.document(docID).get("stored");
+ if (storedValue == null) {
+ assertFalse("There should be no DocValue for " + base,
+ docValues.advanceExact(docID));
+ } else {
+ assertTrue("There should be a DocValue for " + base,
+ docValues.advanceExact(docID));
+ assertEquals("The doc value should be correct for " + base,
+ Long.parseLong(storedValue), docValues.longValue());
+ }
}
}
- assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
}
ir.close();
- dir.close();
}
+
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
similarity index 84%
rename from lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
rename to lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
index f7d7714..e6116a9 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene70/TestLucene70NormsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene80/TestLucene80NormsFormat.java
@@ -14,17 +14,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.codecs.lucene70;
+package org.apache.lucene.codecs.lucene80;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.codecs.lucene80.Lucene80Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;
/**
- * Tests Lucene70NormsFormat
+ * Tests Lucene80NormsFormat
*/
-public class TestLucene70NormsFormat extends BaseNormsFormatTestCase {
+public class TestLucene80NormsFormat extends BaseNormsFormatTestCase {
private final Codec codec = new Lucene80Codec();
@Override
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
index 0214e54..442fe7d 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValues.java
@@ -34,7 +34,7 @@ import org.apache.lucene.util.LuceneTestCase;
/** Tests helper methods in DocValues */
public class TestDocValues extends LuceneTestCase {
-
+
/**
* If the field doesn't exist, we return empty instances:
* it can easily happen that a segment just doesn't have any docs with the field.
@@ -123,8 +123,8 @@ public class TestDocValues extends LuceneTestCase {
iw.close();
dir.close();
}
-
- /**
+
+ /**
* field with binary docvalues
*/
public void testBinaryField() throws Exception {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
index bd11b47..fd72496 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@@ -1204,6 +1204,9 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
}
private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception {
+ doTestNumericsVsStoredFields(density, longs, 256);
+ }
+ private void doTestNumericsVsStoredFields(double density, LongSupplier longs, int minDocs) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
@@ -1216,7 +1219,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
doc.add(dvField);
// index some docs
- int numDocs = atLeast(300);
+ int numDocs = atLeast((int) (minDocs*1.172));
// numDocs should be always > 256 so that in case of a codec that optimizes
// for numbers of values <= 256, all storage layouts are tested
assert numDocs > 256;
@@ -1243,12 +1246,17 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
}
// merge some segments and ensure that at least one of them has more than
- // 256 values
- writer.forceMerge(numDocs / 256);
+ // max(256, minDocs) values
+ writer.forceMerge(numDocs / Math.max(256, minDocs));
writer.close();
-
// compare
+ assertDVIterate(dir);
+ dir.close();
+ }
+
+ // Asserts equality of stored value vs. DocValue by iterating DocValues one at a time
+ protected void assertDVIterate(Directory dir) throws IOException {
DirectoryReader ir = DirectoryReader.open(dir);
TestUtil.checkReader(ir);
for (LeafReaderContext context : ir.leaves()) {
@@ -1268,9 +1276,8 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
}
ir.close();
- dir.close();
}
-
+
private void doTestSortedNumericsVsStoredFields(LongSupplier counts, LongSupplier values) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
index 64287a0..5350890 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
@@ -53,7 +53,7 @@ import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneFixedGap;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
+import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80Codec;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -930,7 +930,7 @@ public final class TestUtil {
* Returns the actual default docvalues format (e.g. LuceneMNDocValuesFormat for this version of Lucene.
*/
public static DocValuesFormat getDefaultDocValuesFormat() {
- return new Lucene70DocValuesFormat();
+ return new Lucene80DocValuesFormat();
}
// TODO: generalize all 'test-checks-for-crazy-codecs' to
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
index e259740..5c88fcd 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
@@ -19,7 +19,7 @@
<fieldType name="string_direct" class="solr.StrField" postingsFormat="Direct" docValuesFormat="Direct"/>
<fieldType name="string_standard" class="solr.StrField" postingsFormat="Lucene50"/>
- <fieldType name="string_disk" class="solr.StrField" docValuesFormat="Lucene70"/>
+ <fieldType name="string_disk" class="solr.StrField" docValuesFormat="Lucene80"/>
<fieldType name="string" class="solr.StrField"/>