You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2022/06/21 04:31:08 UTC
[pinot] branch master updated: Use binary search for index creation dictionary lookup (#8924)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 2644f48e7c Use binary search for index creation dictionary lookup (#8924)
2644f48e7c is described below

commit 2644f48e7c896efcff4d5411ff81ddc09e13c3ac
Author: Xiaotian (Jackie) Jiang <17...@users.noreply.github.com>
AuthorDate: Mon Jun 20 21:31:02 2022 -0700

    Use binary search for index creation dictionary lookup (#8924)
---
 .../creator/impl/SegmentDictionaryCreator.java     | 129 +++++++++++----------
 1 file changed, 65 insertions(+), 64 deletions(-)

diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentDictionaryCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentDictionaryCreator.java
index 2805975596..a3c460eac4 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentDictionaryCreator.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentDictionaryCreator.java
@@ -19,16 +19,12 @@
 package org.apache.pinot.segment.local.segment.creator.impl;
 
 import com.google.common.base.Preconditions;
-import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
-import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
-import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
-import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
-import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
 import java.math.BigDecimal;
 import java.nio.ByteOrder;
+import java.util.Arrays;
 import org.apache.commons.io.FileUtils;
 import org.apache.pinot.segment.local.io.util.FixedByteValueReaderWriter;
 import org.apache.pinot.segment.local.io.util.VarLengthValueWriter;
@@ -52,11 +48,11 @@ public class SegmentDictionaryCreator implements Closeable {
   private final File _dictionaryFile;
   private final boolean _useVarLengthDictionary;
 
-  private Int2IntOpenHashMap _intValueToIndexMap;
-  private Long2IntOpenHashMap _longValueToIndexMap;
-  private Float2IntOpenHashMap _floatValueToIndexMap;
-  private Double2IntOpenHashMap _doubleValueToIndexMap;
-  private Object2IntOpenHashMap<Object> _objectValueToIndexMap;
+  private int[] _sortedInts;
+  private long[] _sortedLongs;
+  private float[] _sortedFloats;
+  private double[] _sortedDoubles;
+  private Object[] _sortedObjects;
   private int _numBytesPerEntry = 0;
 
   public SegmentDictionaryCreator(FieldSpec fieldSpec, File indexDir, boolean useVarLengthDictionary) {
@@ -76,96 +72,83 @@ public class SegmentDictionaryCreator implements Closeable {
 
     switch (_storedType) {
       case INT:
-        int[] sortedInts = (int[]) sortedValues;
-        int numValues = sortedInts.length;
+        _sortedInts = (int[]) sortedValues;
+        int numValues = _sortedInts.length;
         Preconditions.checkState(numValues > 0);
-        _intValueToIndexMap = new Int2IntOpenHashMap(numValues);
 
         // Backward-compatible: index file is always big-endian
         try (PinotDataBuffer dataBuffer = PinotDataBuffer.mapFile(_dictionaryFile, false, 0,
             (long) numValues * Integer.BYTES, ByteOrder.BIG_ENDIAN, getClass().getSimpleName());
             FixedByteValueReaderWriter writer = new FixedByteValueReaderWriter(dataBuffer)) {
           for (int i = 0; i < numValues; i++) {
-            int value = sortedInts[i];
-            _intValueToIndexMap.put(value, i);
-            writer.writeInt(i, value);
+            writer.writeInt(i, _sortedInts[i]);
           }
         }
         LOGGER.info("Created dictionary for INT column: {} with cardinality: {}, range: {} to {}", _columnName,
-            numValues, sortedInts[0], sortedInts[numValues - 1]);
+            numValues, _sortedInts[0], _sortedInts[numValues - 1]);
         return;
 
       case LONG:
-        long[] sortedLongs = (long[]) sortedValues;
-        numValues = sortedLongs.length;
+        _sortedLongs = (long[]) sortedValues;
+        numValues = _sortedLongs.length;
         Preconditions.checkState(numValues > 0);
-        _longValueToIndexMap = new Long2IntOpenHashMap(numValues);
 
         // Backward-compatible: index file is always big-endian
         try (PinotDataBuffer dataBuffer = PinotDataBuffer.mapFile(_dictionaryFile, false, 0,
             (long) numValues * Long.BYTES, ByteOrder.BIG_ENDIAN, getClass().getSimpleName());
             FixedByteValueReaderWriter writer = new FixedByteValueReaderWriter(dataBuffer)) {
           for (int i = 0; i < numValues; i++) {
-            long value = sortedLongs[i];
-            _longValueToIndexMap.put(value, i);
-            writer.writeLong(i, value);
+            writer.writeLong(i, _sortedLongs[i]);
           }
         }
         LOGGER.info("Created dictionary for LONG column: {} with cardinality: {}, range: {} to {}", _columnName,
-            numValues, sortedLongs[0], sortedLongs[numValues - 1]);
+            numValues, _sortedLongs[0], _sortedLongs[numValues - 1]);
         return;
 
       case FLOAT:
-        float[] sortedFloats = (float[]) sortedValues;
-        numValues = sortedFloats.length;
+        _sortedFloats = (float[]) sortedValues;
+        numValues = _sortedFloats.length;
         Preconditions.checkState(numValues > 0);
-        _floatValueToIndexMap = new Float2IntOpenHashMap(numValues);
 
         // Backward-compatible: index file is always big-endian
         try (PinotDataBuffer dataBuffer = PinotDataBuffer.mapFile(_dictionaryFile, false, 0,
             (long) numValues * Float.BYTES, ByteOrder.BIG_ENDIAN, getClass().getSimpleName());
             FixedByteValueReaderWriter writer = new FixedByteValueReaderWriter(dataBuffer)) {
           for (int i = 0; i < numValues; i++) {
-            float value = sortedFloats[i];
-            _floatValueToIndexMap.put(value, i);
-            writer.writeFloat(i, value);
+            writer.writeFloat(i, _sortedFloats[i]);
           }
         }
         LOGGER.info("Created dictionary for FLOAT column: {} with cardinality: {}, range: {} to {}", _columnName,
-            numValues, sortedFloats[0], sortedFloats[numValues - 1]);
+            numValues, _sortedFloats[0], _sortedFloats[numValues - 1]);
         return;
 
       case DOUBLE:
-        double[] sortedDoubles = (double[]) sortedValues;
-        numValues = sortedDoubles.length;
+        _sortedDoubles = (double[]) sortedValues;
+        numValues = _sortedDoubles.length;
         Preconditions.checkState(numValues > 0);
-        _doubleValueToIndexMap = new Double2IntOpenHashMap(numValues);
 
         // Backward-compatible: index file is always big-endian
         try (PinotDataBuffer dataBuffer = PinotDataBuffer.mapFile(_dictionaryFile, false, 0,
             (long) numValues * Double.BYTES, ByteOrder.BIG_ENDIAN, getClass().getSimpleName());
             FixedByteValueReaderWriter writer = new FixedByteValueReaderWriter(dataBuffer)) {
           for (int i = 0; i < numValues; i++) {
-            double value = sortedDoubles[i];
-            _doubleValueToIndexMap.put(value, i);
-            writer.writeDouble(i, value);
+            writer.writeDouble(i, _sortedDoubles[i]);
           }
         }
         LOGGER.info("Created dictionary for DOUBLE column: {} with cardinality: {}, range: {} to {}", _columnName,
-            numValues, sortedDoubles[0], sortedDoubles[numValues - 1]);
+            numValues, _sortedDoubles[0], _sortedDoubles[numValues - 1]);
         return;
 
       case BIG_DECIMAL:
         BigDecimal[] sortedBigDecimals = (BigDecimal[]) sortedValues;
+        _sortedObjects = sortedBigDecimals;
         numValues = sortedBigDecimals.length;
         Preconditions.checkState(numValues > 0);
-        _objectValueToIndexMap = new Object2IntOpenHashMap<>(numValues);
 
         // Get the maximum length of all entries
         byte[][] sortedBigDecimalBytes = new byte[numValues][];
         for (int i = 0; i < numValues; i++) {
           BigDecimal value = sortedBigDecimals[i];
-          _objectValueToIndexMap.put(value, i);
           byte[] valueBytes = BigDecimalUtils.serialize(value);
           sortedBigDecimalBytes[i] = valueBytes;
           _numBytesPerEntry = Math.max(_numBytesPerEntry, valueBytes.length);
@@ -179,15 +162,14 @@ public class SegmentDictionaryCreator implements Closeable {
 
       case STRING:
         String[] sortedStrings = (String[]) sortedValues;
+        _sortedObjects = sortedStrings;
         numValues = sortedStrings.length;
         Preconditions.checkState(numValues > 0);
-        _objectValueToIndexMap = new Object2IntOpenHashMap<>(numValues);
 
         // Get the maximum length of all entries
         byte[][] sortedStringBytes = new byte[numValues][];
         for (int i = 0; i < numValues; i++) {
           String value = sortedStrings[i];
-          _objectValueToIndexMap.put(value, i);
           byte[] valueBytes = value.getBytes(UTF_8);
           sortedStringBytes[i] = valueBytes;
           _numBytesPerEntry = Math.max(_numBytesPerEntry, valueBytes.length);
@@ -201,16 +183,15 @@ public class SegmentDictionaryCreator implements Closeable {
 
       case BYTES:
         ByteArray[] sortedBytes = (ByteArray[]) sortedValues;
+        _sortedObjects = sortedBytes;
         numValues = sortedBytes.length;
         Preconditions.checkState(numValues > 0);
-        _objectValueToIndexMap = new Object2IntOpenHashMap<>(numValues);
 
         // Get the maximum length of all entries
         byte[][] sortedByteArrays = new byte[numValues][];
         for (int i = 0; i < numValues; i++) {
           ByteArray value = sortedBytes[i];
           sortedByteArrays[i] = value.getBytes();
-          _objectValueToIndexMap.put(value, i);
           _numBytesPerEntry = Math.max(_numBytesPerEntry, value.getBytes().length);
         }
 
@@ -262,20 +243,20 @@ public class SegmentDictionaryCreator implements Closeable {
   public int indexOfSV(Object value) {
     switch (_storedType) {
       case INT:
-        return _intValueToIndexMap.get((int) value);
+        return indexOf((int) value);
       case LONG:
-        return _longValueToIndexMap.get((long) value);
+        return indexOf((long) value);
       case FLOAT:
-        return _floatValueToIndexMap.get((float) value);
+        return indexOf((float) value);
       case DOUBLE:
-        return _doubleValueToIndexMap.get((double) value);
-      case STRING:
+        return indexOf((double) value);
       case BIG_DECIMAL:
-        return _objectValueToIndexMap.getInt(value);
+      case STRING:
+        return indexOf(value);
       case BYTES:
-        return _objectValueToIndexMap.getInt(new ByteArray((byte[]) value));
+        return indexOf(new ByteArray((byte[]) value));
       default:
-        throw new UnsupportedOperationException("Unsupported data type : " + _storedType);
+        throw new UnsupportedOperationException("Unsupported SV data type: " + _storedType);
     }
   }
 
@@ -286,49 +267,69 @@ public class SegmentDictionaryCreator implements Closeable {
     switch (_storedType) {
       case INT:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _intValueToIndexMap.get((int) multiValues[i]);
+          indexes[i] = indexOf((int) multiValues[i]);
         }
         break;
       case LONG:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _longValueToIndexMap.get((long) multiValues[i]);
+          indexes[i] = indexOf((long) multiValues[i]);
         }
         break;
       case FLOAT:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _floatValueToIndexMap.get((float) multiValues[i]);
+          indexes[i] = indexOf((float) multiValues[i]);
         }
         break;
       case DOUBLE:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _doubleValueToIndexMap.get((double) multiValues[i]);
+          indexes[i] = indexOf((double) multiValues[i]);
         }
         break;
       case STRING:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _objectValueToIndexMap.getInt(multiValues[i]);
+          indexes[i] = indexOf(multiValues[i]);
         }
         break;
       case BYTES:
         for (int i = 0; i < multiValues.length; i++) {
-          indexes[i] = _objectValueToIndexMap.getInt(new ByteArray((byte[]) multiValues[i]));
+          indexes[i] = indexOf(new ByteArray((byte[]) multiValues[i]));
         }
         break;
       default:
-        throw new UnsupportedOperationException("Unsupported data type : " + _storedType);
+        throw new UnsupportedOperationException("Unsupported MV data type: " + _storedType);
     }
     return indexes;
   }
 
+  private int indexOf(int value) {
+    return Arrays.binarySearch(_sortedInts, value);
+  }
+
+  private int indexOf(long value) {
+    return Arrays.binarySearch(_sortedLongs, value);
+  }
+
+  private int indexOf(float value) {
+    return Arrays.binarySearch(_sortedFloats, value);
+  }
+
+  private int indexOf(double value) {
+    return Arrays.binarySearch(_sortedDoubles, value);
+  }
+
+  private int indexOf(Object value) {
+    return Arrays.binarySearch(_sortedObjects, value);
+  }
+
   /**
    * Cleans up the no longer needed objects after all the indexing is done to free up some memory.
    */
   public void postIndexingCleanup() {
-    _intValueToIndexMap = null;
-    _longValueToIndexMap = null;
-    _floatValueToIndexMap = null;
-    _doubleValueToIndexMap = null;
-    _objectValueToIndexMap = null;
+    _sortedInts = null;
+    _sortedLongs = null;
+    _sortedFloats = null;
+    _sortedDoubles = null;
+    _sortedObjects = null;
   }
 
   @Override


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org