You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/04/25 07:18:26 UTC
[lucene] branch main updated: LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new 2a4c21bb586 LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
2a4c21bb586 is described below

commit 2a4c21bb586c2c5afb8550b88cbfd9dd15d433c5
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Mon Apr 25 09:18:21 2022 +0200

    LUCENE-8836: Speed up TermsEnum#lookupOrd on increasing sequences of ords. (#827)
---
 lucene/CHANGES.txt                                 |  3 +
 .../codecs/lucene90/Lucene90DocValuesProducer.java | 18 ++++--
 .../lucene90/TestLucene90DocValuesFormat.java      | 65 ++++++++++++++++++++++
 3 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c2c1fcd6cd4..76304b3eee9 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -121,6 +121,9 @@ Optimizations
 
 * LUCENE-10315: Use SIMD instructions to decode BKD doc IDs. (Guo Feng, Adrien Grand, Ignacio Vera)
 
+* LUCENE-8836: Speed up calls to TermsEnum#lookupOrd on doc values terms enums
+  and sequences of increasing ords. (Bruno Roustant, Adrien Grand)
+
 Bug Fixes
 ---------------------
 * LUCENE-10477: Highlighter: WeightedSpanTermExtractor.extractWeightedSpanTerms to Query#rewrite
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
index b0f8e048502..f1647d26273 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java
@@ -1111,13 +1111,19 @@ final class Lucene90DocValuesProducer extends DocValuesProducer {
       if (ord < 0 || ord >= entry.termsDictSize) {
         throw new IndexOutOfBoundsException();
       }
-      final long blockIndex = ord >>> TERMS_DICT_BLOCK_LZ4_SHIFT;
-      final long blockAddress = blockAddresses.get(blockIndex);
-      bytes.seek(blockAddress);
-      this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
-      do {
+      // Signed shift since ord is -1 when the terms enum is not positioned
+      final long currentBlockIndex = this.ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
+      final long blockIndex = ord >> TERMS_DICT_BLOCK_LZ4_SHIFT;
+      if (ord < this.ord || blockIndex != currentBlockIndex) {
+        // The looked up ord is before the current ord or belongs to a different block, seek again
+        final long blockAddress = blockAddresses.get(blockIndex);
+        bytes.seek(blockAddress);
+        this.ord = (blockIndex << TERMS_DICT_BLOCK_LZ4_SHIFT) - 1;
+      }
+      // Scan to the looked up ord
+      while (this.ord < ord) {
         next();
-      } while (this.ord < ord);
+      }
     }
 
     private BytesRef getTermFromIndex(long index) throws IOException {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
index 4e79ee9dfd7..ef085bb03db 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90DocValuesFormat.java
@@ -865,4 +865,69 @@ public class TestLucene90DocValuesFormat extends BaseCompressingDocValuesFormatT
       ireader.close();
     }
   }
+
+  public void testSortedTermsDictLookupOrd() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
+    Document doc = new Document();
+    SortedDocValuesField field = new SortedDocValuesField("foo", new BytesRef());
+    doc.add(field);
+    final int numDocs = atLeast(Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
+    for (int i = 0; i < numDocs; ++i) {
+      field.setBytesValue(new BytesRef("" + i));
+      writer.addDocument(doc);
+    }
+    writer.forceMerge(1);
+    IndexReader reader = DirectoryReader.open(writer);
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    doTestTermsDictLookupOrd(leafReader.getSortedDocValues("foo").termsEnum());
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
+  public void testSortedSetTermsDictLookupOrd() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
+    Document doc = new Document();
+    SortedSetDocValuesField field = new SortedSetDocValuesField("foo", new BytesRef());
+    doc.add(field);
+    final int numDocs = atLeast(2 * Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE + 1);
+    for (int i = 0; i < numDocs; ++i) {
+      field.setBytesValue(new BytesRef("" + i));
+      writer.addDocument(doc);
+    }
+    writer.forceMerge(1);
+    IndexReader reader = DirectoryReader.open(writer);
+    LeafReader leafReader = getOnlyLeafReader(reader);
+    doTestTermsDictLookupOrd(leafReader.getSortedSetDocValues("foo").termsEnum());
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
+  private void doTestTermsDictLookupOrd(TermsEnum te) throws IOException {
+    List<BytesRef> terms = new ArrayList<>();
+    for (BytesRef term = te.next(); term != null; term = te.next()) {
+      terms.add(BytesRef.deepCopyOf(term));
+    }
+
+    // iterate in order
+    for (int i = 0; i < terms.size(); ++i) {
+      te.seekExact(i);
+      assertEquals(terms.get(i), te.term());
+    }
+
+    // iterate in reverse order
+    for (int i = terms.size() - 1; i >= 0; --i) {
+      te.seekExact(i);
+      assertEquals(terms.get(i), te.term());
+    }
+
+    // iterate in forward order with random gaps
+    for (int i = random().nextInt(5); i < terms.size(); i += random().nextInt(5)) {
+      te.seekExact(i);
+      assertEquals(terms.get(i), te.term());
+    }
+  }
 }