You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2021/05/14 08:10:50 UTC

[lucene-solr] branch branch_8x updated (23c34c7 -> 86b6d35)

This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a change to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git.


    from 23c34c7  LUCENE-9827: avoid wasteful recompression for small segments (#2495)
     new e4d4438  LUCENE-9725: Remove unused imports.
     new 86b6d35  LUCENE-9932: Performance improvement for BKD index building

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 lucene/CHANGES.txt                                 |  2 +
 .../apache/lucene/codecs/MutablePointValues.java   |  5 ++
 .../org/apache/lucene/index/PointValuesWriter.java | 26 +++++++
 .../org/apache/lucene/util/MSBRadixSorter.java     | 10 +--
 .../apache/lucene/util/StableMSBRadixSorter.java   | 81 ++++++++++++++++++++++
 .../lucene/util/bkd/MutablePointsReaderUtils.java  | 65 ++++++++---------
 ...ixSorter.java => TestStableMSBRadixSorter.java} | 56 +++++++++++----
 .../test/org/apache/lucene/util/bkd/TestBKD.java   | 22 ++++++
 .../util/bkd/TestMutablePointsReaderUtils.java     | 60 +++++++++++++---
 .../apache/lucene/search/CombinedFieldQuery.java   | 20 ------
 .../lucene/search/MultiNormsLeafSimScorer.java     |  2 -
 11 files changed, 258 insertions(+), 91 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java
 copy lucene/core/src/test/org/apache/lucene/util/{TestMSBRadixSorter.java => TestStableMSBRadixSorter.java} (75%)

[lucene-solr] 02/02: LUCENE-9932: Performance improvement for BKD index building

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit 86b6d35f7229010757924da3312ffb2fe72b17f4
Author: neoReMinD <xu...@gmail.com>
AuthorDate: Fri May 14 09:49:27 2021 +0200

    LUCENE-9932: Performance improvement for BKD index building
---
 lucene/CHANGES.txt                                 |   2 +
 .../apache/lucene/codecs/MutablePointValues.java   |   5 +
 .../org/apache/lucene/index/PointValuesWriter.java |  26 +++
 .../org/apache/lucene/util/MSBRadixSorter.java     |  10 +-
 .../apache/lucene/util/StableMSBRadixSorter.java   |  81 ++++++++
 .../lucene/util/bkd/MutablePointsReaderUtils.java  |  65 +++----
 .../lucene/util/TestStableMSBRadixSorter.java      | 211 +++++++++++++++++++++
 .../test/org/apache/lucene/util/bkd/TestBKD.java   |  22 +++
 .../util/bkd/TestMutablePointsReaderUtils.java     |  60 ++++--
 9 files changed, 428 insertions(+), 54 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fb5c0ab..ac78502 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,8 @@ Improvements
 Optimizations
 ---------------------
 
+* LUCENE-9932: Performance improvement for BKD index building (neoremind)
+
 * LUCENE-9827: Speed up merging of stored fields and term vectors for smaller segments.
   (Daniel Mitterdorfer, Dimitrios Liapis, Adrien Grand, Robert Muir)
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
index 8f4d69c..2d38ac5 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MutablePointValues.java
@@ -39,4 +39,9 @@ public abstract class MutablePointValues extends PointValues {
   /** Swap the i-th and j-th values. */
   public abstract void swap(int i, int j);
 
+  /** Save the i-th value into the j-th position in temporary storage. */
+  public abstract void save(int i, int j);
+
+  /** Restore values between i-th and j-th(excluding) in temporary storage into original storage. */
+  public abstract void restore(int i, int j);
 }
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index d715872..ddbc6a6 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -72,6 +72,7 @@ class PointValuesWriter {
   public void flush(SegmentWriteState state, Sorter.DocMap sortMap, PointsWriter writer) throws IOException {
     PointValues points = new MutablePointValues() {
       final int[] ords = new int[numPoints];
+      int[] temp;
       {
         for (int i = 0; i < numPoints; ++i) {
           ords[i] = i;
@@ -154,6 +155,21 @@ class PointValuesWriter {
         final long offset = (long) packedBytesLength * ords[i] + k;
         return bytes.readByte(offset);
       }
+
+      @Override
+      public void save(int i, int j) {
+        if (temp == null) {
+          temp = new int[ords.length];
+        }
+        temp[j] = ords[i];
+      }
+
+      @Override
+      public void restore(int i, int j) {
+        if (temp != null) {
+          System.arraycopy(temp, i, ords, i, j - i);
+        }
+      }
     };
 
     final PointValues values;
@@ -277,5 +293,15 @@ class PointValuesWriter {
     public void swap(int i, int j) {
       in.swap(i, j);
     }
+
+    @Override
+    public void save(int i, int j) {
+      in.save(i, j);
+    }
+
+    @Override
+    public void restore(int i, int j) {
+      in.restore(i, j);
+    }
   }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
index f0e2a61..f1059a9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/MSBRadixSorter.java
@@ -31,7 +31,7 @@ public abstract class MSBRadixSorter extends Sorter {
   // locality)
   private static final int LEVEL_THRESHOLD = 8;
   // size of histograms: 256 + 1 to indicate that the string is finished
-  private static final int HISTOGRAM_SIZE = 257;
+  protected static final int HISTOGRAM_SIZE = 257;
   // buckets below this size will be sorted with introsort
   private static final int LENGTH_THRESHOLD = 100;
 
@@ -40,7 +40,7 @@ public abstract class MSBRadixSorter extends Sorter {
   private final int[] endOffsets = new int[HISTOGRAM_SIZE];
   private final int[] commonPrefix;
 
-  private final int maxLength;
+  protected final int maxLength;
 
   /**
    * Sole constructor.
@@ -121,7 +121,7 @@ public abstract class MSBRadixSorter extends Sorter {
     sort(from, to, 0, 0);
   }
 
-  private void sort(int from, int to, int k, int l) {
+  protected void sort(int from, int to, int k, int l) {
     if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
       introSort(from, to, k);
     } else {
@@ -195,7 +195,7 @@ public abstract class MSBRadixSorter extends Sorter {
   }
 
   /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
-  private int getBucket(int i, int k) {
+  protected int getBucket(int i, int k) {
     return byteAt(i, k) + 1;
   }
 
@@ -268,7 +268,7 @@ public abstract class MSBRadixSorter extends Sorter {
    * @param startOffsets start offsets per bucket
    * @param endOffsets end offsets per bucket
    */
-  private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
+  protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
     // reorder in place, like the dutch flag problem
     for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
       final int limit = endOffsets[i];
diff --git a/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java
new file mode 100644
index 0000000..c6ec774
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/StableMSBRadixSorter.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util;
+
+/**
+ * Stable radix sorter for variable-length strings.
+ *
+ * @lucene.internal
+ */
+public abstract class StableMSBRadixSorter extends MSBRadixSorter {
+
+  private final int[] fixedStartOffsets;
+
+  public StableMSBRadixSorter(int maxLength) {
+    super(maxLength);
+    fixedStartOffsets = new int[HISTOGRAM_SIZE];
+  }
+
+  /** Save the i-th value into the j-th position in temporary storage. */
+  protected abstract void save(int i, int j);
+
+  /** Restore values between i-th and j-th(excluding) in temporary storage into original storage. */
+  protected abstract void restore(int i, int j);
+
+  @Override
+  protected Sorter getFallbackSorter(int k) {
+    return new InPlaceMergeSorter() {
+      @Override
+      protected void swap(int i, int j) {
+        StableMSBRadixSorter.this.swap(i, j);
+      }
+
+      @Override
+      protected int compare(int i, int j) {
+        for (int o = k; o < maxLength; ++o) {
+          final int b1 = byteAt(i, o);
+          final int b2 = byteAt(j, o);
+          if (b1 != b2) {
+            return b1 - b2;
+          } else if (b1 == -1) {
+            break;
+          }
+        }
+        return 0;
+      }
+    };
+  }
+
+  /**
+   * Reorder elements in stable way, since Dutch sort does not guarantee ordering for same values.
+   *
+   * <p>When this method returns, startOffsets and endOffsets are equal.
+   */
+  @Override
+  protected void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
+    System.arraycopy(startOffsets, 0, fixedStartOffsets, 0, startOffsets.length);
+    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
+      final int limit = endOffsets[i];
+      for (int h1 = fixedStartOffsets[i]; h1 < limit; h1++) {
+        final int b = getBucket(from + h1, k);
+        final int h2 = startOffsets[b]++;
+        save(from + h1, from + h2);
+      }
+    }
+    restore(from, to);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
index 3d54e69..e6bfd2a 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/MutablePointsReaderUtils.java
@@ -21,9 +21,9 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FutureArrays;
 import org.apache.lucene.util.IntroSelector;
 import org.apache.lucene.util.IntroSorter;
-import org.apache.lucene.util.MSBRadixSorter;
 import org.apache.lucene.util.RadixSelector;
 import org.apache.lucene.util.Selector;
+import org.apache.lucene.util.StableMSBRadixSorter;
 import org.apache.lucene.util.packed.PackedInts;
 
 /** Utility APIs for sorting and partitioning buffered points.
@@ -36,8 +36,22 @@ public final class MutablePointsReaderUtils {
   /** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
   public static void sort(BKDConfig config, int maxDoc,
                           MutablePointValues reader, int from, int to) {
-    final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
-    new MSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
+    boolean sortedByDocID = true;
+    int prevDoc = 0;
+    for (int i = from; i < to; ++i) {
+      int doc = reader.getDocID(i);
+      if (doc < prevDoc) {
+        sortedByDocID = false;
+        break;
+      }
+      prevDoc = doc;
+    }
+
+    // No need to tie break on doc IDs if already sorted by doc ID, since we use a stable sort.
+    // This should be a common situation as IndexWriter accumulates data in doc ID order when
+    // index sorting is not enabled.
+    final int bitsPerDocId = sortedByDocID ? 0 : PackedInts.bitsRequired(maxDoc - 1);
+    new StableMSBRadixSorter(config.packedBytesLength + (bitsPerDocId + 7) / 8) {
 
       @Override
       protected void swap(int i, int j) {
@@ -45,6 +59,16 @@ public final class MutablePointsReaderUtils {
       }
 
       @Override
+      protected void save(int i, int j) {
+        reader.save(i, j);
+      }
+
+      @Override
+      protected void restore(int i, int j) {
+        reader.restore(i, j);
+      }
+
+      @Override
       protected int byteAt(int i, int k) {
         if (k < config.packedBytesLength) {
           return Byte.toUnsignedInt(reader.getByteAt(i, k));
@@ -53,41 +77,6 @@ public final class MutablePointsReaderUtils {
           return (reader.getDocID(i) >>> Math.max(0, shift)) & 0xff;
         }
       }
-
-      @Override
-      protected org.apache.lucene.util.Sorter getFallbackSorter(int k) {
-        return new IntroSorter() {
-
-          final BytesRef pivot = new BytesRef();
-          final BytesRef scratch = new BytesRef();
-          int pivotDoc;
-
-          @Override
-          protected void swap(int i, int j) {
-            reader.swap(i, j);
-          }
-
-          @Override
-          protected void setPivot(int i) {
-            reader.getValue(i, pivot);
-            pivotDoc = reader.getDocID(i);
-          }
-
-          @Override
-          protected int comparePivot(int j) {
-            if (k < config.packedBytesLength) {
-              reader.getValue(j, scratch);
-              int cmp = FutureArrays.compareUnsigned(pivot.bytes, pivot.offset + k, pivot.offset + k + config.packedBytesLength - k,
-                      scratch.bytes, scratch.offset + k, scratch.offset + k + config.packedBytesLength - k);
-              if (cmp != 0) {
-                return cmp;
-              }
-            }
-            return pivotDoc - reader.getDocID(j);
-          }
-        };
-      }
-
     }.sort(from, to);
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java b/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java
new file mode 100644
index 0000000..387b6b6
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/TestStableMSBRadixSorter.java
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+public class TestStableMSBRadixSorter extends LuceneTestCase {
+
+  private void test(BytesRef[] refs, int len) {
+    BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len);
+    Arrays.sort(expected);
+
+    int maxLength = 0;
+    for (int i = 0; i < len; ++i) {
+      BytesRef ref = refs[i];
+      maxLength = Math.max(maxLength, ref.length);
+    }
+    switch (random().nextInt(3)) {
+      case 0:
+        maxLength += TestUtil.nextInt(random(), 1, 5);
+        break;
+      case 1:
+        maxLength = Integer.MAX_VALUE;
+        break;
+      default:
+        // leave unchanged
+        break;
+    }
+
+    final int finalMaxLength = maxLength;
+    new StableMSBRadixSorter(maxLength) {
+
+      private BytesRef[] temp;
+
+      @Override
+      protected int byteAt(int i, int k) {
+        assertTrue(k < finalMaxLength);
+        BytesRef ref = refs[i];
+        if (ref.length <= k) {
+          return -1;
+        }
+        return ref.bytes[ref.offset + k] & 0xff;
+      }
+
+      @Override
+      protected void swap(int i, int j) {
+        BytesRef tmp = refs[i];
+        refs[i] = refs[j];
+        refs[j] = tmp;
+      }
+
+      @Override
+      protected void save(int i, int j) {
+        if (temp == null) {
+          temp = new BytesRef[refs.length];
+        }
+        temp[j] = refs[i];
+      }
+
+      @Override
+      protected void restore(int i, int j) {
+        if (temp != null) {
+          System.arraycopy(temp, i, refs, i, j - i);
+        }
+      }
+    }.sort(0, len);
+    BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len);
+    assertArrayEquals(expected, actual);
+    // Verify that the arrays are not only equal after sorting with Arrays#sort and
+    // StableMSBRadixSorter
+    // but also that they have the very same instance at every index.
+    // This is different from MSBRadixSorter which does not guarantee ordering of the same value.
+    assertEquals(expected.length, actual.length);
+    for (int i = 0; i < expected.length; i++) {
+      assertSame(expected[i].bytes, actual[i].bytes);
+    }
+  }
+
+  public void testEmpty() {
+    test(new BytesRef[random().nextInt(5)], 0);
+  }
+
+  public void testOneValue() {
+    BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
+    test(new BytesRef[] {bytes}, 1);
+  }
+
+  public void testTwoValues() {
+    BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
+    BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
+    test(new BytesRef[] {bytes1, bytes2}, 2);
+  }
+
+  private void testRandom(int commonPrefixLen, int maxLen) {
+    byte[] commonPrefix = new byte[commonPrefixLen];
+    random().nextBytes(commonPrefix);
+    final int len = random().nextInt(100000);
+    BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
+    for (int i = 0; i < len; ++i) {
+      byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
+      random().nextBytes(b);
+      System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
+      bytes[i] = new BytesRef(b);
+    }
+    test(bytes, len);
+  }
+
+  public void testRandom() {
+    for (int iter = 0; iter < 10; ++iter) {
+      testRandom(0, 10);
+    }
+  }
+
+  public void testRandomWithLotsOfDuplicates() {
+    for (int iter = 0; iter < 10; ++iter) {
+      testRandom(0, 2);
+    }
+  }
+
+  public void testRandomWithSharedPrefix() {
+    for (int iter = 0; iter < 10; ++iter) {
+      testRandom(TestUtil.nextInt(random(), 1, 30), 10);
+    }
+  }
+
+  public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
+    for (int iter = 0; iter < 10; ++iter) {
+      testRandom(TestUtil.nextInt(random(), 1, 30), 2);
+    }
+  }
+
+  public void testRandom2() {
+    // how large our alphabet is
+    int letterCount = TestUtil.nextInt(random(), 2, 10);
+
+    // how many substring fragments to use
+    int substringCount = TestUtil.nextInt(random(), 2, 10);
+    Set<BytesRef> substringsSet = new HashSet<>();
+
+    // how many strings to make
+    int stringCount = atLeast(10000);
+
+    // System.out.println("letterCount=" + letterCount + " substringCount=" + substringCount + "
+    // stringCount=" + stringCount);
+    while (substringsSet.size() < substringCount) {
+      int length = TestUtil.nextInt(random(), 2, 10);
+      byte[] bytes = new byte[length];
+      for (int i = 0; i < length; i++) {
+        bytes[i] = (byte) random().nextInt(letterCount);
+      }
+      BytesRef br = new BytesRef(bytes);
+      substringsSet.add(br);
+      // System.out.println("add substring count=" + substringsSet.size() + ": " + br);
+    }
+
+    BytesRef[] substrings = substringsSet.toArray(new BytesRef[substringsSet.size()]);
+    double[] chance = new double[substrings.length];
+    double sum = 0.0;
+    for (int i = 0; i < substrings.length; i++) {
+      chance[i] = random().nextDouble();
+      sum += chance[i];
+    }
+
+    // give each substring a random chance of occurring:
+    double accum = 0.0;
+    for (int i = 0; i < substrings.length; i++) {
+      accum += chance[i] / sum;
+      chance[i] = accum;
+    }
+
+    Set<BytesRef> stringsSet = new HashSet<>();
+    int iters = 0;
+    while (stringsSet.size() < stringCount && iters < stringCount * 5) {
+      int count = TestUtil.nextInt(random(), 1, 5);
+      BytesRefBuilder b = new BytesRefBuilder();
+      for (int i = 0; i < count; i++) {
+        double v = random().nextDouble();
+        accum = 0.0;
+        for (int j = 0; j < substrings.length; j++) {
+          accum += chance[j];
+          if (accum >= v) {
+            b.append(substrings[j]);
+            break;
+          }
+        }
+      }
+      BytesRef br = b.toBytesRef();
+      stringsSet.add(br);
+      // System.out.println("add string count=" + stringsSet.size() + ": " + br);
+      iters++;
+    }
+
+    test(stringsSet.toArray(new BytesRef[stringsSet.size()]), stringsSet.size());
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index f6eacc6..d37284a 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -1465,6 +1465,18 @@ public class TestBKD extends LuceneTestCase {
 
       @Override
       public byte getByteAt(int i, int k) {
+        BytesRef b = new BytesRef();
+        getValue(i, b);
+        return b.bytes[b.offset + k];
+      }
+
+      @Override
+      public void save(int i, int j) {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public void restore(int i, int j) {
         throw new UnsupportedOperationException();
       }
     };
@@ -1582,6 +1594,16 @@ public class TestBKD extends LuceneTestCase {
       public int getDocCount() {
         return 11;
       }
+
+      @Override
+      public void save(int i, int j) {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public void restore(int i, int j) {
+        throw new UnsupportedOperationException();
+      }
     };
     try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
       IllegalStateException ex = expectThrows(IllegalStateException.class, () -> { w.writeField(out, out, out, "", val);});
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
index e75ab24..d4869b0 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestMutablePointsReaderUtils.java
@@ -30,16 +30,22 @@ import org.apache.lucene.util.TestUtil;
 public class TestMutablePointsReaderUtils extends LuceneTestCase {
 
   public void testSort() {
-    for (int iter = 0; iter < 5; ++iter) {
-      doTestSort();
+    for (int iter = 0; iter < 10; ++iter) {
+      doTestSort(false);
+    }
+  }
+
+  public void testSortWithIncrementalDocId() {
+    for (int iter = 0; iter < 10; ++iter) {
+      doTestSort(true);
     }
   }
 
-  private void doTestSort() {
+  private void doTestSort(boolean isDocIdIncremental) {
     final int bytesPerDim = TestUtil.nextInt(random(), 1, 16);
     final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
     BKDConfig config = new BKDConfig(1, 1, bytesPerDim, BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE);
-    Point[] points = createRandomPoints(config, maxDoc, new int[1]);
+    Point[] points = createRandomPoints(config, maxDoc, new int[1], isDocIdIncremental);
     DummyPointsReader reader = new DummyPointsReader(points);
     MutablePointsReaderUtils.sort(config, maxDoc, reader, 0, points.length);
     Arrays.sort(points, new Comparator<Point>() {
@@ -53,7 +59,23 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
       }
     });
     assertNotSame(points, reader.points);
-    assertArrayEquals(points, reader.points);
+    assertEquals(points.length, reader.points.length);
+
+    // Check doc IDs are in ascending order.
+    // If doc IDs are already increasing, StableMSBRadixSorter should keep doc ID's ordering.
+    // If doc IDs are not ordered, StableMSBRadixSorter should compare doc ID to guarantee the
+    // ordering.
+    Point prevPoint = null;
+    for (int i = 0; i < points.length; i++) {
+      assertEquals(points[i].packedValue, reader.points[i].packedValue);
+      assertSame(points[i].packedValue, reader.points[i].packedValue);
+      if (prevPoint != null) {
+        if (reader.points[i].packedValue.equals(prevPoint.packedValue)) {
+          assertTrue(reader.points[i].doc >= prevPoint.doc);
+        }
+      }
+      prevPoint = reader.points[i];
+    }
   }
 
   public void testSortByDim() {
@@ -66,7 +88,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
     BKDConfig config = createRandomConfig();
     final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
     int[] commonPrefixLengths = new int[config.numDims];
-    Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
+    Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
     DummyPointsReader reader = new DummyPointsReader(points);
     final int sortedDim = random().nextInt(config.numIndexDims);
     MutablePointsReaderUtils.sortByDim(config, sortedDim, commonPrefixLengths, reader, 0, points.length,
@@ -99,7 +121,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
     BKDConfig config = createRandomConfig();
     int[] commonPrefixLengths  = new int[config.numDims];
     final int maxDoc = TestUtil.nextInt(random(), 1, 1 << random().nextInt(30));
-    Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths);
+    Point[] points = createRandomPoints(config, maxDoc, commonPrefixLengths, false);
     final int splitDim =  random().nextInt(config.numIndexDims);
     DummyPointsReader reader = new DummyPointsReader(points);
     final int pivot = TestUtil.nextInt(random(), 0, points.length - 1);
@@ -138,15 +160,15 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
     return new BKDConfig(numDims, numIndexDims, bytesPerDim, maxPointsInLeafNode);
   }
 
-  private static Point[] createRandomPoints(BKDConfig config, int maxDoc, int[] commonPrefixLengths) {
+  private static Point[] createRandomPoints(BKDConfig config, int maxDoc, int[] commonPrefixLengths, boolean isDocIdIncremental) {
     assertTrue(commonPrefixLengths.length == config.numDims);
     final int numPoints = TestUtil.nextInt(random(), 1, 100000);
     Point[] points = new Point[numPoints];
-    if (random().nextInt(5) != 0) {
+    if (random().nextInt(10) != 0) {
       for (int i = 0; i < numPoints; ++i) {
         byte[] value = new byte[config.packedBytesLength];
         random().nextBytes(value);
-        points[i] = new Point(value, random().nextInt(maxDoc));
+        points[i] = new Point(value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
       }
       for (int i = 0; i < config.numDims; ++i) {
         commonPrefixLengths[i] = TestUtil.nextInt(random(), 0, config.bytesPerDim);
@@ -170,7 +192,7 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
         System.arraycopy(indexDims, 0, value, 0, config.packedIndexBytesLength);
         random().nextBytes(dataDims);
         System.arraycopy(dataDims, 0, value, config.packedIndexBytesLength, numDataDims * config.bytesPerDim);
-        points[i] = new Point(value, random().nextInt(maxDoc));
+        points[i] = new Point(value, isDocIdIncremental ? Math.min(i, maxDoc - 1) : random().nextInt(maxDoc));
       }
       for (int i = 0; i < config.numIndexDims; ++i) {
         commonPrefixLengths[i] = config.bytesPerDim;
@@ -228,6 +250,8 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
 
     private final Point[] points;
 
+    private Point[] temp;
+
     DummyPointsReader(Point[] points) {
       this.points = points.clone();
     }
@@ -300,6 +324,20 @@ public class TestMutablePointsReaderUtils extends LuceneTestCase {
       throw new UnsupportedOperationException();
     }
 
+    @Override
+    public void save(int i, int j) {
+      if (temp == null) {
+        temp = new Point[points.length];
+      }
+      temp[j] = points[i];
+    }
+
+    @Override
+    public void restore(int i, int j) {
+      if (temp != null) {
+        System.arraycopy(temp, i, points, i, j - i);
+      }
+    }
   }
 
 }

[lucene-solr] 01/02: LUCENE-9725: Remove unused imports.

Posted by jp...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

jpountz pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit e4d4438e047eb68110e6d7c6242c11c3fcd121e2
Author: Adrien Grand <jp...@gmail.com>
AuthorDate: Tue Apr 6 14:35:06 2021 +0200

    LUCENE-9725: Remove unused imports.
---
 .../org/apache/lucene/search/CombinedFieldQuery.java | 20 --------------------
 .../lucene/search/MultiNormsLeafSimScorer.java       |  2 --
 2 files changed, 22 deletions(-)

diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
index 6838358..f673711 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java
@@ -33,26 +33,6 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.TermStates;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.CollectionStatistics;
-import org.apache.lucene.search.DisiPriorityQueue;
-import org.apache.lucene.search.DisiWrapper;
-import org.apache.lucene.search.DisjunctionDISIApproximation;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.LeafSimScorer;
-import org.apache.lucene.search.Matches;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.QueryVisitor;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.SynonymQuery;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TermScorer;
-import org.apache.lucene.search.TermStatistics;
-import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.search.similarities.DFRSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
index c354eb3..8ebefcd 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java
@@ -25,8 +25,6 @@ import java.util.List;
 import java.util.Objects;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
 import org.apache.lucene.util.SmallFloat;