You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/25 07:18:26 UTC
[lucene] branch main updated: LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 2a4c21bb586 LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
2a4c21bb586 is described below
commit 2a4c21bb586c2c5afb8550b88cbfd9dd15d433c5
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Apr 25 09:18:21 2022 +0200
LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
---
lucene/CHANGES.txt | 3 +
.../codecs/lucene90/Lucene90DocValuesProducer.java | 18 ++++--
.../lucene90/TestLucene90DocValuesFormat.java | 65 ++++++++++++++++++++++
3 files changed, 80 insertions(+), 6 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c2c1fcd6cd4..76304b3eee9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -121,6 +121,9 @@ Optimizations
* LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)
+* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
+ and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
+
Bug Fixes
---------------------
* LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
index b0f8e048502..f1647d26273 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
@@ -1111,13 +1111,19 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
if (ord < 0 || ord >= entry.termsDictSize) {
throw new IndexOutOfBoundsException();
}
- final long blockIndex = ord >>> TERMS_DICT_BLOCK_LZ4_SHIFT;
- final long blockAddress = blockAddresses.get(blockIndex);
- bytes.seek(blockAddress);
- this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
- do {
+ // Signed shift since ord is -1 when the terms enum is not positioned
+ final long currentBlockIndex = this.ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
+ final long blockIndex = ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
+ if (ord < this.ord || blockIndex != currentBlockIndex) {
+ // The looked up ord is before the current ord or belongs to a different block, seek again
+ final long blockAddress = blockAddresses.get(blockIndex);
+ bytes.seek(blockAddress);
+ this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
+ }
+ // Scan to the looked up ord
+ while (this.ord < ord) {
next();
- } while (this.ord < ord);
+ }
}
private BytesRef getTermFromIndex(long index) throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
index 4e79ee9dfd7..ef085bb03db 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
@@ -865,4 +865,69 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
ireader.close();
}
}
+
+ public void testSortedTermsDictLookupOrd() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
+ Document doc = new Document();
+ SortedDocValuesField field = new SortedDocValuesField("foo", new BytesRef());
+ doc.add(field);
+ final int numDocs = atLeast(Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
+ for (int i = 0; i < numDocs; ++i) {
+ field.setBytesValue(new BytesRef("" + i));
+ writer.addDocument(doc);
+ }
+ writer.forceMerge(1);
+ IndexReader reader = DirectoryReader.open(writer);
+ LeafReader leafReader = getOnlyLeafReader(reader);
+ doTestTermsDictLookupOrd(leafReader.getSortedDocValues("foo").termsEnum());
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ public void testSortedSetTermsDictLookupOrd() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
+ Document doc = new Document();
+ SortedSetDocValuesField field = new SortedSetDocValuesField("foo", new BytesRef());
+ doc.add(field);
+ final int numDocs = atLeast(2 * Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
+ for (int i = 0; i < numDocs; ++i) {
+ field.setBytesValue(new BytesRef("" + i));
+ writer.addDocument(doc);
+ }
+ writer.forceMerge(1);
+ IndexReader reader = DirectoryReader.open(writer);
+ LeafReader leafReader = getOnlyLeafReader(reader);
+ doTestTermsDictLookupOrd(leafReader.getSortedSetDocValues("foo").termsEnum());
+ reader.close();
+ writer.close();
+ dir.close();
+ }
+
+ private void doTestTermsDictLookupOrd(TermsEnum te) throws IOException {
+ List<BytesRef> terms = new ArrayList<>();
+ for (BytesRef term = te.next(); term != null; term = te.next()) {
+ terms.add(BytesRef.deepCopyOf(term));
+ }
+
+ // iterate in order
+ for (int i = 0; i < terms.size(); ++i) {
+ te.seekExact(i);
+ assertEquals(terms.get(i), te.term());
+ }
+
+ // iterate in reverse order
+ for (int i = terms.size() - 1; i >= 0; --i) {
+ te.seekExact(i);
+ assertEquals(terms.get(i), te.term());
+ }
+
+ // iterate in forward order with random gaps
+ for (int i = random().nextInt(5); i < terms.size(); i += random().nextInt(5)) {
+ te.seekExact(i);
+ assertEquals(terms.get(i), te.term());
+ }
+ }
}