You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@lucene.apache.org by GitBox <gi...@apache.org> on 2021/11/08 17:53:27 UTC

[GitHub] [lucene] jpountz commented on a change in pull request #430: LUCENE-10225: Improve IntroSelector.

jpountz commented on a change in pull request #430:
URL: https://github.com/apache/lucene/pull/430#discussion_r744962085



##########
File path: lucene/core/src/java/org/apache/lucene/util/MathUtil.java
##########
@@ -46,6 +46,24 @@ public static double log(double base, double x) {
     return Math.log(x) / Math.log(base);
   }
 
+  /**
+   * Returns {@code x <= 0 ? 0 : Math.floor(Math.log(x) / Math.log(2))}.
+   *
+   * <p>This specialized method is 30x faster than {@link #log(long, int)}.
+   */
+  public static int log2(int x) {
+    return x <= 0 ? 0 : 31 - Integer.numberOfLeadingZeros(x);
+  }
+
+  /**
+   * Returns {@code x <= 0 ? 0 : Math.floor(Math.log(x) / Math.log(2))}.
+   *
+   * <p>This specialized method is 30x faster than {@link #log(long, int)}.
+   */
+  public static int log2(long x) {
+    return x <= 0 ? 0 : 63 - Long.numberOfLeadingZeros(x);
+  }

Review comment:
       should we instead have special branches for `base == 2` in `#log(long, int)`?

##########
File path: lucene/core/src/java/org/apache/lucene/util/IntroSelector.java
##########
@@ -17,173 +17,147 @@
 package org.apache.lucene.util;
 
 import java.util.Comparator;
+import java.util.Random;
 
 /**
- * Implementation of the quick select algorithm.
+ * Implementation of the introspective quick select algorithm using Tukey's ninther
+ * median-of-medians for pivot selection and Bentley-McIlroy 3-way partitioning. In addition, small
+ * ranges are sorted with insertion sort. The introspective protection shuffles the sub-range if the
+ * max recursive depth is exceeded.
  *
- * <p>It uses the median of the first, middle and last values as a pivot and falls back to a median
- * of medians when the number of recursion levels exceeds {@code 2 lg(n)}, as a consequence it runs
+ * <p>This selection algorithm is fast on most data shapes, especially with low cardinality. It runs
  * in linear time on average.
  *
  * @lucene.internal
  */
 public abstract class IntroSelector extends Selector {
 
+  private Random random;
+
   @Override
   public final void select(int from, int to, int k) {
     checkArgs(from, to, k);
-    final int maxDepth = 2 * MathUtil.log(to - from, 2);
-    quickSelect(from, to, k, maxDepth);
+    select(from, to, k, 2 * MathUtil.log2(to - from));
   }
 
-  int slowSelect(int from, int to, int k) {
-    return medianOfMediansSelect(from, to - 1, k);
-  }
+  private void select(int from, int to, int k, int maxDepth) {
+    // This code is similar to IntroSorter#sort, adapted to loop on a single partition.
+
+    // Sort small ranges with insertion sort.
+    int size;
+    while ((size = to - from) > Sorter.INSERTION_SORT_THRESHOLD) {
 
-  int medianOfMediansSelect(int left, int right, int k) {
-    do {
-      // Defensive check, this is also checked in the calling
-      // method. Including here so this method can be used
-      // as a self contained quickSelect implementation.
-      if (left == right) {
-        return left;
+      if (--maxDepth == -1) {
+        // Max recursion depth exceeded: shuffle (only once) and continue.
+        shuffle(from, to);
       }
-      int pivotIndex = pivot(left, right);
-      pivotIndex = partition(left, right, k, pivotIndex);
-      if (k == pivotIndex) {
-        return k;
-      } else if (k < pivotIndex) {
-        right = pivotIndex - 1;
+
+      // Pivot selection based on medians.
+      int last = to - 1;
+      int mid = (from + last) >>> 1;
+      int pivot;
+      if (size <= IntroSorter.SINGLE_MEDIAN_THRESHOLD) {
+        // Select the pivot with a single median around the middle element.
+        // Do not take the median between [from, mid, last] because it hurts performance
+        // if the order is descending in conjunction with the 3-way partitioning.
+        int range = size >> 2;
+        pivot = median(mid - range, mid, mid + range);
       } else {
-        left = pivotIndex + 1;
+        // Select the pivot with the Tukey's ninther median of medians.
+        int range = size >> 3;
+        int doubleRange = range << 1;
+        int medianFirst = median(from, from + range, from + doubleRange);
+        int medianMiddle = median(mid - range, mid, mid + range);
+        int medianLast = median(last - doubleRange, last - range, last);
+        pivot = median(medianFirst, medianMiddle, medianLast);
       }
-    } while (left != right);
-    return left;
-  }
 
-  private int partition(int left, int right, int k, int pivotIndex) {
-    setPivot(pivotIndex);
-    swap(pivotIndex, right);
-    int storeIndex = left;
-    for (int i = left; i < right; i++) {
-      if (comparePivot(i) > 0) {
-        swap(storeIndex, i);
-        storeIndex++;
+      // Bentley-McIlroy 3-way partitioning.
+      setPivot(pivot);
+      swap(from, pivot);
+      int i = from;
+      int j = to;
+      int p = from + 1;
+      int q = last;
+      while (true) {
+        int leftCmp, rightCmp;
+        while ((leftCmp = comparePivot(++i)) > 0) {}
+        while ((rightCmp = comparePivot(--j)) < 0) {}
+        if (i >= j) {
+          if (i == j && rightCmp == 0) {
+            swap(i, p);
+          }
+          break;
+        }
+        swap(i, j);
+        if (rightCmp == 0) {
+          swap(i, p++);
+        }
+        if (leftCmp == 0) {
+          swap(j, q--);
+        }
       }
-    }
-    int storeIndexEq = storeIndex;
-    for (int i = storeIndex; i < right; i++) {
-      if (comparePivot(i) == 0) {
-        swap(storeIndexEq, i);
-        storeIndexEq++;
+      i = j + 1;
+      for (int l = from; l < p; ) {
+        swap(l++, j--);
       }
-    }
-    swap(right, storeIndexEq);
-    if (k < storeIndex) {
-      return storeIndex;
-    } else if (k <= storeIndexEq) {
-      return k;
-    }
-    return storeIndexEq;
-  }
-
-  private int pivot(int left, int right) {
-    if (right - left < 5) {
-      int pivotIndex = partition5(left, right);
-      return pivotIndex;
-    }
-
-    for (int i = left; i <= right; i = i + 5) {
-      int subRight = i + 4;
-      if (subRight > right) {
-        subRight = right;
+      for (int l = last; l > q; ) {
+        swap(l--, i++);
       }
-      int median5 = partition5(i, subRight);
-      swap(median5, left + ((i - left) / 5));
-    }
-    int mid = ((right - left) / 10) + left + 1;
-    int to = left + ((right - left) / 5);
-    return medianOfMediansSelect(left, to, mid);
-  }
 
-  // selects the median of a group of at most five elements,
-  // implemented using insertion sort. Efficient due to
-  // bounded nature of data set.
-  private int partition5(int left, int right) {
-    int i = left + 1;
-    while (i <= right) {
-      int j = i;
-      while (j > left && compare(j - 1, j) > 0) {
-        swap(j - 1, j);
-        j--;
+      // Select the partition containing the k-th element.
+      if (k <= j) {
+        to = j + 1;
+      } else if (k >= i) {
+        from = i;
+      } else {
+        return;
       }
-      i++;
     }
-    return (left + right) >>> 1;
-  }
 
-  private void quickSelect(int from, int to, int k, int maxDepth) {
-    assert from <= k;
-    assert k < to;
-    if (to - from == 1) {
-      return;
-    }
-    if (--maxDepth < 0) {
-      slowSelect(from, to, k);
-      return;
-    }
+    insertionSort(from, to);
+  }
 
-    final int mid = (from + to) >>> 1;
-    // heuristic: we use the median of the values at from, to-1 and mid as a pivot
-    if (compare(from, to - 1) > 0) {
-      swap(from, to - 1);
-    }
-    if (compare(to - 1, mid) > 0) {
-      swap(to - 1, mid);
-      if (compare(from, to - 1) > 0) {
-        swap(from, to - 1);
+  /** Copy of {@code IntroSorter#median}. */
+  private int median(int i, int j, int k) {
+    if (compare(i, j) < 0) {
+      if (compare(j, k) <= 0) {
+        return j;
       }
+      return compare(i, k) < 0 ? k : i;
     }
+    if (compare(j, k) >= 0) {
+      return j;
+    }
+    return compare(i, k) < 0 ? i : k;
+  }
 
-    setPivot(to - 1);
-
-    int left = from + 1;
-    int right = to - 2;
-
-    for (; ; ) {
-      while (comparePivot(left) > 0) {
-        ++left;
-      }
-
-      while (left < right && comparePivot(right) <= 0) {
-        --right;
-      }
-
-      if (left < right) {
-        swap(left, right);
-        --right;
-      } else {
-        break;
+  /** Copy of {@link Sorter#insertionSort(int, int)}. */
+  void insertionSort(int from, int to) {
+    for (int i = from + 1; i < to; ) {
+      int current = i++;
+      int previous;
+      while (compare((previous = current - 1), current) > 0) {
+        swap(previous, current);
+        if (previous == from) {
+          break;
+        }
+        current = previous;
       }
     }
-    swap(left, to - 1);
-
-    if (left == k) {
-      return;
-    } else if (left < k) {
-      quickSelect(left + 1, to, k, maxDepth);
-    } else {
-      quickSelect(from, left, k, maxDepth);
-    }
   }
 
   /**
-   * Compare entries found in slots <code>i</code> and <code>j</code>. The contract for the returned
-   * value is the same as {@link Comparator#compare(Object, Object)}.
+   * Shuffles the entries between from (inclusive) and to (exclusive) with Durstenfeld's algorithm.
    */
-  protected int compare(int i, int j) {
-    setPivot(i);
-    return comparePivot(j);
+  private void shuffle(int from, int to) {
+    if (this.random == null) {
+      this.random = new Random();

Review comment:
       Use SplittableRandom instead?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org
For additional commands, e-mail: issues-help@lucene.apache.org