You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2019/10/29 08:32:38 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9024: Optimize
IntroSelector to use median of medians (#966)
This is an automated email from the ASF dual-hosted git repository.
jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 7d0dbda LUCENE-9024: Optimize IntroSelector to use median of medians (#966)
7d0dbda is described below
commit 7d0dbdaf62299389876a01621430b3ede572ed80
Author: Paul Sanwald <pa...@elastic.co>
AuthorDate: Tue Oct 29 04:21:52 2019 -0400
LUCENE-9024: Optimize IntroSelector to use median of medians (#966)
---
lucene/CHANGES.txt | 4 +
.../java/org/apache/lucene/util/IntroSelector.java | 97 ++++++++++++++++++----
.../org/apache/lucene/util/TestIntroSelector.java | 2 +-
3 files changed, 85 insertions(+), 18 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 89406a0..d2d111b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,6 +35,10 @@ Optimizations
* LUCENE-8932: BKDReader's index is now stored off-heap when the IndexInput is
an instance of ByteBufferIndexInput. (Jack Conradson via Adrien Grand)
+* LUCENE-9024: IntroSelector now falls back to the median of medians algorithm
+ instead of sorting when the maximum recursion level is exceeded, providing
+ better worst-case runtime. (Paul Sanwald via Adrien Grand)
+
Bug Fixes
* LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski)
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
index 2984801..b492d22 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
@@ -20,9 +20,8 @@ import java.util.Comparator;
/** Implementation of the quick select algorithm.
* <p>It uses the median of the first, middle and last values as a pivot and
- * falls back to a heap sort when the number of recursion levels exceeds
- * {@code 2 lg(n)}, as a consequence it runs in linear time on average and in
- * {@code n log(n)} time in the worst case.</p>
+ * falls back to a median of medians when the number of recursion levels exceeds
+ * {@code 2 lg(n)}, as a consequence it runs in linear time on average.</p>
* @lucene.internal */
public abstract class IntroSelector extends Selector {
@@ -33,26 +32,90 @@ public abstract class IntroSelector extends Selector {
quickSelect(from, to, k, maxDepth);
}
- // heap sort
- // TODO: use median of median instead to have linear worst-case rather than
- // n*log(n)
- void slowSelect(int from, int to, int k) {
- new Sorter() {
+ int slowSelect(int from, int to, int k) {
+ return medianOfMediansSelect(from, to-1, k);
+ }
+
+ int medianOfMediansSelect(int left, int right, int k) {
+ do {
+ // Defensive check, this is also checked in the calling
+ // method. Including here so this method can be used
+ // as a self contained quickSelect implementation.
+ if (left == right) {
+ return left;
+ }
+ int pivotIndex = pivot(left, right);
+ pivotIndex = partition(left, right, k, pivotIndex);
+ if (k == pivotIndex) {
+ return k;
+ } else if (k < pivotIndex) {
+ right = pivotIndex-1;
+ } else {
+ left = pivotIndex+1;
+ }
+ } while (left != right);
+ return left;
+ }
- @Override
- protected void swap(int i, int j) {
- IntroSelector.this.swap(i, j);
+ private int partition(int left, int right, int k, int pivotIndex) {
+ setPivot(pivotIndex);
+ swap(pivotIndex, right);
+ int storeIndex = left;
+ for (int i = left; i < right; i++) {
+ if (comparePivot(i) > 0) {
+ swap(storeIndex, i);
+ storeIndex++;
+ }
+ }
+ int storeIndexEq = storeIndex;
+ for (int i = storeIndex; i < right; i++) {
+ if (comparePivot(i) == 0) {
+ swap(storeIndexEq, i);
+ storeIndexEq++;
}
+ }
+ swap(right, storeIndexEq);
+ if (k < storeIndex) {
+ return storeIndex;
+ } else if (k <= storeIndexEq) {
+ return k;
+ }
+ return storeIndexEq;
+ }
+
+ private int pivot(int left, int right) {
+ if (right - left < 5) {
+ int pivotIndex = partition5(left, right);
+ return pivotIndex;
+ }
- @Override
- protected int compare(int i, int j) {
- return IntroSelector.this.compare(i, j);
+ for (int i = left; i <= right; i=i+5) {
+ int subRight = i + 4;
+ if (subRight > right) {
+ subRight = right;
}
+ int median5 = partition5(i, subRight);
+ swap(median5, left + ((i-left)/5));
+ }
+ int mid = ((right - left) / 10) + left + 1;
+ int to = left + ((right - left)/5);
+ return medianOfMediansSelect(left, to, mid);
+ }
- public void sort(int from, int to) {
- heapSort(from, to);
+ // selects the median of a group of at most five elements,
+ // implemented using insertion sort. Efficient due to
+ // bounded nature of data set.
+ private int partition5(int left, int right) {
+ int i = left + 1;
+ while( i <= right) {
+ int j = i;
+ while (j > left && compare(j-1,j)>0) {
+ swap(j-1, j);
+ j--;
}
- }.sort(from, to);
+ i++;
+ }
+ return (left + right) >>> 1;
}
private void quickSelect(int from, int to, int k, int maxDepth) {
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
index 468b2f7..c10269a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
@@ -36,7 +36,7 @@ public class TestIntroSelector extends LuceneTestCase {
final int from = random().nextInt(5);
final int to = from + TestUtil.nextInt(random(), 1, 10000);
final int max = random().nextBoolean() ? random().nextInt(100) : random().nextInt(100000);
- Integer[] arr = new Integer[from + to + random().nextInt(5)];
+ Integer[] arr = new Integer[to + random().nextInt(5)];
for (int i = 0; i < arr.length; ++i) {
arr[i] = TestUtil.nextInt(random(), 0, max);
}