You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/27 09:56:57 UTC
[15/16] lucene-solr:jira/http2: LUCENE-8562: Speed up merging segments of points with data dimensions by only sorting on the indexed dimensions

LUCENE-8562: Speed up merging segments of points with data dimensions by only sorting on the indexed dimensions


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/72ca4488
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/72ca4488
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/72ca4488

Branch: refs/heads/jira/http2
Commit: 72ca4488d1313ffd2b9b8cf43027f7677022e80f
Parents: 68c0774
Author: iverase <iv...@apache.org>
Authored: Tue Nov 27 10:26:49 2018 +0100
Committer: iverase <iv...@apache.org>
Committed: Tue Nov 27 10:26:49 2018 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  3 +
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 74 +++++++++++++++-----
 .../org/apache/lucene/util/bkd/TestBKD.java     | 29 ++++++++
 3 files changed, 87 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72ca4488/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5a347d8..86d06bc 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -225,6 +225,9 @@ Improvements
 * LUCENE-8463: TopFieldCollector can now early-terminates queries when sorting by SortField.DOC.
   (Christophe Bismuth via Jim Ferenczi)
 
+* LUCENE-8562: Speed up merging segments of points with data dimensions by only sorting on the indexed
+  dimensions. (Ignacio Vera)
+
 Optimizations
 
 * LUCENE-8552: FieldInfos.getMergedFieldInfos no longer does any merging if there is <= 1 segment.

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72ca4488/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index c4ac04e..1ffa275 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -767,6 +767,10 @@ public class BKDWriter implements Closeable {
   /** Sort the heap writer by the specified dim */
   private void sortHeapPointWriter(final HeapPointWriter writer, int dim) {
     final int pointCount = Math.toIntExact(this.pointCount);
+    sortHeapPointWriter(writer, pointCount, dim); // delegate to the overload that takes the point count explicitly
+  }
+  /** Sort the heap writer by the specified dim; the point count is passed in explicitly instead of being read from {@code this.pointCount} */
+  private void sortHeapPointWriter(final HeapPointWriter writer, int pointCount, int dim) {
     // Tie-break by docID:
 
     // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it
@@ -959,7 +963,7 @@ public class BKDWriter implements Closeable {
     }
 
     LongBitSet ordBitSet;
-    if (numDataDims > 1) {
+    if (numIndexDims > 1) {
       if (singleValuePerDoc) {
         ordBitSet = new LongBitSet(maxDoc);
       } else {
@@ -994,7 +998,7 @@ public class BKDWriter implements Closeable {
     assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;
 
     // Sort all docs once by each dimension:
-    PathSlice[] sortedPointWriters = new PathSlice[numDataDims];
+    PathSlice[] sortedPointWriters = new PathSlice[numIndexDims];
 
     // This is only used on exception; on normal code paths we close all files we opened:
     List<Closeable> toCloseHeroically = new ArrayList<>();
@@ -1002,9 +1006,7 @@ public class BKDWriter implements Closeable {
     boolean success = false;
     try {
       //long t0 = System.nanoTime();
-      // even with selective indexing we create the sortedPointWriters so we can compress
-      // the leaf node data by common prefix
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         sortedPointWriters[dim] = new PathSlice(sort(dim), 0, pointCount);
       }
       //long t1 = System.nanoTime();
@@ -1445,7 +1447,7 @@ public class BKDWriter implements Closeable {
       boolean result = reader.next();
       assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
       System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
-      if (numDataDims > 1) {
+      if (numIndexDims > 1) {
         assert ordBitSet.get(reader.ord()) == false;
         ordBitSet.set(reader.ord());
         // Subtract 1 from rightCount because we already did the first value above (so we could record the split value):
@@ -1619,7 +1621,7 @@ public class BKDWriter implements Closeable {
       assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
           docIDs, 0);
       writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
-      
+
       out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
       scratchOut.reset();
 
@@ -1678,10 +1680,10 @@ public class BKDWriter implements Closeable {
                      long[] leafBlockFPs,
                      List<Closeable> toCloseHeroically) throws IOException {
 
-    for(PathSlice slice : slices) {
+    for (PathSlice slice : slices) {
       assert slice.count == slices[0].count;
     }
-    
+
     if (numDataDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
       // Special case for 1D, to cutover to heap once we recurse deeply enough:
       slices[0] = switchToHeap(slices[0], toCloseHeroically);
@@ -1695,7 +1697,7 @@ public class BKDWriter implements Closeable {
       int sortedDim = 0;
       int sortedDimCardinality = Integer.MAX_VALUE;
 
-      for (int dim=0;dim<numDataDims;dim++) {
+      for (int dim=0;dim<numIndexDims;dim++) {
         if (slices[dim].writer instanceof HeapPointWriter == false) {
           // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
           // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
@@ -1740,7 +1742,41 @@ public class BKDWriter implements Closeable {
         }
       }
 
-      PathSlice source = slices[sortedDim];
+      PathSlice dataDimPathSlice = null;
+
+      if (numDataDims != numIndexDims) {
+        HeapPointWriter heapSource = (HeapPointWriter) slices[0].writer;
+        int from = (int) slices[0].start;
+        int to = from + (int) slices[0].count;
+        Arrays.fill(commonPrefixLengths, numIndexDims, numDataDims, bytesPerDim);
+        heapSource.readPackedValue(from, scratch1);
+        for (int i = from + 1; i < to; ++i) {
+          heapSource.readPackedValue(i, scratch2);
+          for (int dim = numIndexDims; dim < numDataDims; dim++) {
+            final int offset = dim * bytesPerDim;
+            for (int j = 0; j < commonPrefixLengths[dim]; j++) {
+              if (scratch1[offset + j] != scratch2[offset + j]) {
+                commonPrefixLengths[dim] = j;
+                break;
+              }
+            }
+          }
+        }
+        //handle case when all index dimensions contain the same value but not the data dimensions
+        if (commonPrefixLengths[sortedDim] == bytesPerDim) {
+          for (int dim = numIndexDims; dim < numDataDims; ++dim) {
+            if (commonPrefixLengths[dim] != bytesPerDim) {
+              sortedDim = dim;
+              //create a new slice in memory
+              dataDimPathSlice = switchToHeap(slices[0], toCloseHeroically);
+              sortHeapPointWriter((HeapPointWriter) dataDimPathSlice.writer, (int) dataDimPathSlice.count, sortedDim);
+              break;
+            }
+          }
+        }
+      }
+
+      PathSlice source = (dataDimPathSlice != null) ? dataDimPathSlice : slices[sortedDim];
 
       // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
       HeapPointWriter heapSource = (HeapPointWriter) source.writer;
@@ -1804,8 +1840,8 @@ public class BKDWriter implements Closeable {
 
       // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:
 
-      PathSlice[] leftSlices = new PathSlice[numDataDims];
-      PathSlice[] rightSlices = new PathSlice[numDataDims];
+      PathSlice[] leftSlices = new PathSlice[numIndexDims];
+      PathSlice[] rightSlices = new PathSlice[numIndexDims];
 
       byte[] minSplitPackedValue = new byte[packedIndexBytesLength];
       System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedIndexBytesLength);
@@ -1815,13 +1851,13 @@ public class BKDWriter implements Closeable {
 
       // When we are on this dim, below, we clear the ordBitSet:
       int dimToClear;
-      if (numDataDims - 1 == splitDim) {
-        dimToClear = numDataDims - 2;
+      if (numIndexDims - 1 == splitDim) {
+        dimToClear = numIndexDims - 2;
       } else {
-        dimToClear = numDataDims - 1;
+        dimToClear = numIndexDims - 1;
       }
 
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
 
         if (dim == splitDim) {
           // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
@@ -1858,7 +1894,7 @@ public class BKDWriter implements Closeable {
             ordBitSet, out,
             minPackedValue, maxSplitPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
         if (dim != splitDim) {
           leftSlices[dim].writer.destroy();
@@ -1871,7 +1907,7 @@ public class BKDWriter implements Closeable {
             ordBitSet, out,
             minSplitPackedValue, maxPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
         if (dim != splitDim) {
           rightSlices[dim].writer.destroy();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72ca4488/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index d75d785..a01c927 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -492,6 +492,35 @@ public class TestBKD extends LuceneTestCase {
     verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
   }
 
+  public void testIndexDimEqualDataDimDifferent() throws Exception {
+    int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int numDataDims = TestUtil.nextInt(random(), 2, 5);
+    int numIndexDims = TestUtil.nextInt(random(), 1, numDataDims - 1);
+
+    int numDocs = atLeast(1000);
+    byte[][][] docValues = new byte[numDocs][][];
+    // Scenario: every doc has the same value in each index dim; only the data-only dims vary.
+    byte[][] indexDimensions = new byte[numIndexDims][];
+    for(int dim=0;dim<numIndexDims;dim++) {
+      indexDimensions[dim] = new byte[numBytesPerDim];
+      random().nextBytes(indexDimensions[dim]);
+    }
+    // Each doc aliases the shared index values and gets fresh random bytes for every data-only dim:
+    for(int docID=0;docID<numDocs;docID++) {
+      byte[][] values = new byte[numDataDims][];
+      for(int dim=0;dim<numIndexDims;dim++) {
+        values[dim] = indexDimensions[dim];
+      }
+      for (int dim = numIndexDims; dim < numDataDims; dim++) {
+        values[dim] = new byte[numBytesPerDim];
+        random().nextBytes(values[dim]);
+      }
+      docValues[docID] = values;
+    }
+
+    verify(docValues, null, numDataDims, numIndexDims, numBytesPerDim);
+  }
+
   public void testOneDimEqual() throws Exception {
     int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
     int numDataDims = TestUtil.nextInt(random(), 1, 5);