You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2019/10/29 08:32:38 UTC

[lucene-solr] branch branch_8x updated: LUCENE-9024: Optimize IntroSelector to use median of medians (#966)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 7d0dbda  LUCENE-9024: Optimize IntroSelector to use median of medians (#966)
7d0dbda is described below

commit 7d0dbdaf62299389876a01621430b3ede572ed80
Author: Paul Sanwald <pa...@elastic.co>
AuthorDate: Tue Oct 29 04:21:52 2019 -0400

    LUCENE-9024: Optimize IntroSelector to use median of medians (#966)
---
 lucene/CHANGES.txt                                 |  4 +
 .../java/org/apache/lucene/util/IntroSelector.java | 97 ++++++++++++++++++----
 .../org/apache/lucene/util/TestIntroSelector.java  |  2 +-
 3 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 89406a0..d2d111b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -35,6 +35,10 @@ Optimizations
 * LUCENE-8932: BKDReader's index is now stored off-heap when the IndexInput is
   an instance of ByteBufferIndexInput. (Jack Conradson via Adrien Grand)
 
+* LUCENE-9024: IntroSelector now falls back to the median of medians algorithm
+  instead of sorting when the maximum recursion level is exceeded, providing
+  better worst-case runtime. (Paul Sanwald via Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-9001: Fix race condition in SetOnce. (Przemko Robakowski)
diff --git a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
index 2984801..b492d22 100644
--- a/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
+++ b/lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
@@ -20,9 +20,8 @@ import java.util.Comparator;
 
 /** Implementation of the quick select algorithm.
  *  <p>It uses the median of the first, middle and last values as a pivot and
- *  falls back to a heap sort when the number of recursion levels exceeds
- *  {@code 2 lg(n)}, as a consequence it runs in linear time on average and in
- *  {@code n log(n)} time in the worst case.</p>
+ *  falls back to a median of medians when the number of recursion levels exceeds
+ *  {@code 2 lg(n)}, as a consequence it runs in linear time on average.</p>
  *  @lucene.internal */
 public abstract class IntroSelector extends Selector {
 
@@ -33,26 +32,90 @@ public abstract class IntroSelector extends Selector {
     quickSelect(from, to, k, maxDepth);
   }
 
-  // heap sort
-  // TODO: use median of median instead to have linear worst-case rather than
-  // n*log(n)
-  void slowSelect(int from, int to, int k) {
-    new Sorter() {
+  int slowSelect(int from, int to, int k) {
+    return medianOfMediansSelect(from, to-1, k);
+  }
+
+  int medianOfMediansSelect(int left, int right, int k) {
+    do {
+      // Defensive check, this is also checked in the calling
+      // method. Including here so this method can be used
+      // as a self contained quickSelect implementation.
+      if (left == right) {
+        return left;
+      }
+      int pivotIndex = pivot(left, right);
+      pivotIndex = partition(left, right, k, pivotIndex);
+      if (k == pivotIndex) {
+        return k;
+      } else if (k < pivotIndex) {
+        right = pivotIndex-1;
+      } else {
+        left = pivotIndex+1;
+      }
+    } while (left != right);
+    return left;
+  }
 
-      @Override
-      protected void swap(int i, int j) {
-        IntroSelector.this.swap(i, j);
+  private int partition(int left, int right, int k, int pivotIndex) {
+    setPivot(pivotIndex);
+    swap(pivotIndex, right);
+    int storeIndex = left;
+    for (int i = left; i < right; i++) {
+      if (comparePivot(i) > 0) {
+        swap(storeIndex, i);
+        storeIndex++;
+      }
+    }
+    int storeIndexEq = storeIndex;
+    for (int i = storeIndex; i < right; i++) {
+      if (comparePivot(i) == 0) {
+        swap(storeIndexEq, i);
+        storeIndexEq++;
       }
+    }
+    swap(right, storeIndexEq);
+    if (k < storeIndex) {
+      return storeIndex;
+    } else if (k <= storeIndexEq) {
+      return k;
+    }
+    return storeIndexEq;
+  }
+
+  private int pivot(int left, int right) {
+    if (right - left < 5) {
+      int pivotIndex = partition5(left, right);
+      return pivotIndex;
+    }
 
-      @Override
-      protected int compare(int i, int j) {
-        return IntroSelector.this.compare(i, j);
+    for (int i = left; i <= right; i=i+5) {
+      int subRight = i + 4;
+      if (subRight > right) {
+        subRight = right;
       }
+      int median5 = partition5(i, subRight);
+      swap(median5, left + ((i-left)/5));
+    }
+    int mid = ((right - left) / 10) + left + 1;
+    int to = left + ((right - left)/5);
+    return medianOfMediansSelect(left, to, mid);
+  }
 
-      public void sort(int from, int to) {
-        heapSort(from, to);
+  // selects the median of a group of at most five elements,
+  // implemented using insertion sort. Efficient due to
+  // bounded nature of data set.
+  private int partition5(int left, int right) {
+    int i = left + 1;
+    while( i <= right) {
+      int j = i;
+      while (j > left && compare(j-1,j)>0) {
+        swap(j-1, j);
+        j--;
       }
-    }.sort(from, to);
+      i++;
+    }
+    return (left + right) >>> 1;
   }
 
   private void quickSelect(int from, int to, int k, int maxDepth) {
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
index 468b2f7..c10269a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestIntroSelector.java
@@ -36,7 +36,7 @@ public class TestIntroSelector extends LuceneTestCase {
     final int from = random().nextInt(5);
     final int to = from + TestUtil.nextInt(random(), 1, 10000);
     final int max = random().nextBoolean() ? random().nextInt(100) : random().nextInt(100000);
-    Integer[] arr = new Integer[from + to + random().nextInt(5)];
+    Integer[] arr = new Integer[to + random().nextInt(5)];
     for (int i = 0; i < arr.length; ++i) {
       arr[i] = TestUtil.nextInt(random(), 0, max);
     }