You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2022/10/18 12:27:34 UTC
[lucene] branch main updated: Revert "Binary search the entries when all suffixes have the same length in a leaf block. (#11722)"
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 2ed16c78464 Revert "Binary search the entries when all suffixes have the same length in a leaf block. (#11722)"
2ed16c78464 is described below
commit 2ed16c78464680a3f410170b688aa45c85d641b7
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Oct 18 14:27:02 2022 +0200
Revert "Binary search the entries when all suffixes have the same length in a leaf block. (#11722)"
This reverts commit 3adec5b1ce94cc6e910375e01d270373a8cebba3.
---
lucene/CHANGES.txt | 3 -
.../lucene90/blocktree/SegmentTermsEnumFrame.java | 99 +---------------------
.../tests/index/BasePostingsFormatTestCase.java | 51 -----------
3 files changed, 4 insertions(+), 149 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4c1fbb84960..00e140ee713 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -112,9 +112,6 @@ Improvements
* GITHUB#687: speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
construction using bkd binary search. (Jianping Weng)
-
-* GITHUB#11722: Binary search the entries when all suffixes have the same length
- in a leaf block of the terms dictionary. (zhouhui)
Bug Fixes
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
index d9a28c51895..48c4fd0a6d4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java
@@ -75,9 +75,6 @@ final class SegmentTermsEnumFrame {
// True if all entries are terms
boolean isLeafBlock;
- // True if all entries have the same length.
- boolean allEqual;
-
long lastSubFP;
int nextFloorLabel;
@@ -190,7 +187,7 @@ final class SegmentTermsEnumFrame {
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
int numSuffixLengthBytes = ste.in.readVInt();
- allEqual = (numSuffixLengthBytes & 0x01) != 0;
+ final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
numSuffixLengthBytes >>>= 1;
if (suffixLengthBytes.length < numSuffixLengthBytes) {
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
@@ -530,9 +527,7 @@ final class SegmentTermsEnumFrame {
// NOTE: sets startBytePos/suffix as a side effect
public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException {
- return isLeafBlock
- ? allEqual ? binarySearchTermLeaf(target, exactOnly) : scanToTermLeaf(target, exactOnly)
- : scanToTermNonLeaf(target, exactOnly);
+ return isLeafBlock ? scanToTermLeaf(target, exactOnly) : scanToTermNonLeaf(target, exactOnly);
}
private int startBytePos;
@@ -577,6 +572,8 @@ final class SegmentTermsEnumFrame {
assert prefixMatches(target);
+ // TODO: binary search when all terms have the same length, which is common for ID fields,
+ // which are also the most sensitive to lookup performance?
// Loop over each entry (term or sub-block) in this block:
do {
nextEnt++;
@@ -649,94 +646,6 @@ final class SegmentTermsEnumFrame {
return SeekStatus.END;
}
- // Target's prefix matches this block's prefix;
- // And all suffixes have the same length in this block,
- // we binary search the entries check if the suffix matches.
- public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException {
- // if (DEBUG) System.out.println(" binarySearchTermLeaf: block fp=" + fp + " prefix=" +
- // prefix + "
- // nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" +
- // brToString(term));
-
- assert nextEnt != -1;
-
- ste.termExists = true;
- subCode = 0;
-
- if (nextEnt == entCount) {
- if (exactOnly) {
- fillTerm();
- }
- return SeekStatus.END;
- }
-
- assert prefixMatches(target);
-
- suffix = suffixLengthsReader.readVInt();
- int start = nextEnt;
- int end = entCount - 1;
- // Binary search the entries (terms) in this leaf block:
- int cmp = 0;
- while (start <= end) {
- int mid = (start + end) / 2;
- nextEnt = mid + 1;
- startBytePos = mid * suffix;
- suffixesReader.setPosition(startBytePos + suffix);
-
- // Binary search bytes in the suffix, comparing to the target
- cmp =
- Arrays.compareUnsigned(
- suffixBytes,
- startBytePos,
- startBytePos + suffix,
- target.bytes,
- target.offset + prefix,
- target.offset + target.length);
- if (cmp < 0) {
- start = mid + 1;
- } else if (cmp > 0) {
- end = mid - 1;
- } else {
- // Exact match!
-
- // This cannot be a sub-block because we
- // would have followed the index to this
- // sub-block from the start:
-
- assert ste.termExists;
- fillTerm();
- // if (DEBUG) System.out.println(" found!");
- return SeekStatus.FOUND;
- }
- }
-
- // It is possible (and OK) that terms index pointed us
- // at this block, but, we searched the entire block and
- // did not find the term to position to. This happens
- // when the target is after the last term in the block
- // (but, before the next term in the index). EG
- // target could be foozzz, and terms index pointed us
- // to the foo* block, but the last term in this block
- // was fooz (and, eg, first term in the next block will
- // bee fop).
- // if (DEBUG) System.out.println(" block end");
- SeekStatus seekStatus = end < entCount - 1 ? SeekStatus.NOT_FOUND : SeekStatus.END;
- if (exactOnly || seekStatus == SeekStatus.NOT_FOUND) {
- // If binary search ended at the less term, we need to advance to the greater term.
- if (cmp < 0) {
- startBytePos += suffix;
- suffixesReader.skipBytes(suffix);
- nextEnt++;
- }
- fillTerm();
- }
-
- // TODO: not consistent that in the
- // not-exact case we don't next() into the next
- // frame here
- return seekStatus;
- }
-
// Target's prefix matches this block's prefix; we
// scan the entries check if the suffix matches.
public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
index 16480692ff8..d98fa17b974 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
@@ -367,57 +367,6 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
dir.close();
}
- public void testBinarySearchTermLeaf() throws Exception {
- Directory dir = newDirectory();
-
- IndexWriterConfig iwc = newIndexWriterConfig(null);
- iwc.setCodec(getCodec());
- iwc.setMergePolicy(newTieredMergePolicy());
- IndexWriter iw = new IndexWriter(dir, iwc);
-
- for (int i = 100000; i <= 100400; i++) {
- // only add odd number
- if (i % 2 == 1) {
- Document document = new Document();
- document.add(new StringField("id", i + "", Field.Store.NO));
- iw.addDocument(document);
- }
- }
- iw.commit();
- iw.forceMerge(1);
-
- DirectoryReader reader = DirectoryReader.open(iw);
- TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator();
- // test seekExact
- for (int i = 100000; i <= 100400; i++) {
- BytesRef target = new BytesRef(i + "");
- if (i % 2 == 1) {
- assertTrue(termsEnum.seekExact(target));
- assertEquals(termsEnum.term(), target);
- } else {
- assertFalse(termsEnum.seekExact(target));
- }
- }
- // test seekCeil
- for (int i = 100000; i < 100400; i++) {
- BytesRef target = new BytesRef(i + "");
- if (i % 2 == 1) {
- assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target));
- assertEquals(termsEnum.term(), target);
- if (i <= 100397) {
- assertEquals(new BytesRef(i + 2 + ""), termsEnum.next());
- }
- } else {
- assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(target));
- assertEquals(new BytesRef(i + 1 + ""), termsEnum.term());
- }
- }
- assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef(100400 + "")));
- reader.close();
- iw.close();
- dir.close();
- }
-
// tests that level 2 ghost fields still work
public void testLevel2Ghosts() throws Exception {
Directory dir = newDirectory();