You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by GitBox <gi...@apache.org> on 2022/09/21 22:55:33 UTC

[GitHub] [pinot] Jackie-Jiang commented on a diff in pull request #9441: [enhancement] [release-note] Update get bytes to return raw bytes of string and support getBytesMV

Jackie-Jiang commented on code in PR #9441:
URL: https://github.com/apache/pinot/pull/9441#discussion_r977022852


##########
pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java:
##########
@@ -493,6 +493,24 @@ public int[] getNumValuesForMVColumn(String column) {
     return numValues;
   }
 
+  /**
+   * Get the bytes values for a multi-valued column.
+   *
+   * @param column Column name
+   * @return Array of bytes values
+   */
+  public byte[][][] getBytesValuesForMVColumn(String column) {

Review Comment:
   (minor) Move it in front of `getNumValuesForMVColumn()`



##########
pinot-core/src/test/java/org/apache/pinot/core/common/DataFetcherTest.java:
##########
@@ -363,7 +363,12 @@ public void testFetchBytesValues(String column) {
     _dataFetcher.fetchBytesValues(column, docIds, length, bytesValues);
 
     for (int i = 0; i < length; i++) {
-      Assert.assertEquals(new String(bytesValues[i], UTF_8), Integer.toString(_values[docIds[i]]), ERROR_MESSAGE);
+      String stringValue = Integer.toString(_values[docIds[i]]);
+      if (column != HEX_STRING_COLUMN && column != NO_DICT_HEX_STRING_COLUMN) {

Review Comment:
   Instead of changing here, we should change `testFetchBytesValues()` to test on `STRING_COLUMN` and `NO_DICT_STRING_COLUMN` instead



##########
pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java:
##########
@@ -419,6 +419,18 @@ public void fetchNumValues(String column, int[] inDocIds, int length, int[] outN
     _columnValueReaderMap.get(column).readNumValuesMV(inDocIds, length, outNumValues);
   }
 
+  /**
+   * Fetch the bytes values for a multi-valued column.
+   *
+   * @param column Column name
+   * @param inDocIds Input document Ids buffer
+   * @param length Number of input document Ids
+   * @param outValues Buffer for output
+   */
+  public void fetchBytesValues(String column, int[] inDocIds, int length, byte[][][] outValues) {

Review Comment:
   (minor) Move it in front of `fetchNumValues()`



##########
pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java:
##########
@@ -611,12 +623,8 @@ void readBytesValues(int[] docIds, int length, byte[][] valueBuffer) {
         _dictionary.readBytesValues(dictIdBuffer, length, valueBuffer);
       } else {
         switch (_storedType) {
-          case STRING:
-            for (int i = 0; i < length; i++) {
-              valueBuffer[i] = BytesUtils.toBytes(_reader.getString(docIds[i], readerContext));
-            }
-            break;
           case BYTES:
+          case STRING:

Review Comment:
   We can remove the switch and directly read the values



##########
pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java:
##########
@@ -748,6 +756,20 @@ public void readNumValuesMV(int[] docIds, int length, int[] numValuesBuffer) {
       }
     }
 
+    void readBytesValuesMV(int[] docIds, int length, byte[][][] valuesBuffer) {

Review Comment:
   (minor) Move it in front of `readNumValuesMV()`



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java:
##########
@@ -40,6 +40,8 @@ default BigDecimal getBigDecimal(int index, int numBytesPerValue) {
     return BigDecimalUtils.deserialize(getBytes(index, numBytesPerValue));
   }
 
+  byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer);

Review Comment:
   Add some javadoc here



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/BaseTransformFunction.java:
##########
@@ -18,6 +18,7 @@
  */
 package org.apache.pinot.core.operator.transform.function;
 
+import com.clearspring.analytics.util.Preconditions;

Review Comment:
   This is the wrong import. We use the `Preconditions` from the guava



##########
pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java:
##########
@@ -748,6 +756,20 @@ public void readNumValuesMV(int[] docIds, int length, int[] numValuesBuffer) {
       }
     }
 
+    void readBytesValuesMV(int[] docIds, int length, byte[][][] valuesBuffer) {
+      Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+      if (_dictionary != null) {
+        for (int i = 0; i < length; i++) {
+          int numValues = _reader.getDictIdMV(docIds[i], _reusableMVDictIds, getReaderContext());

Review Comment:
   Create reader context before entering the loop. See `readStringValuesMV` for reference



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java:
##########
@@ -96,6 +96,11 @@ public String getUnpaddedString(int index, int numBytesPerValue, byte paddingByt
     return new String(buffer, 0, length, UTF_8);
   }
 
+  @Override
+  public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte paddingByte, byte[] buffer) {

Review Comment:
   (minor) Move it before `getUnpaddedString()`



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/transform/function/BaseTransformFunction.java:
##########
@@ -588,4 +589,34 @@ public String[][] transformToStringValuesMV(ProjectionBlock projectionBlock) {
     }
     return _stringValuesMV;
   }
+
+  @Override
+  public byte[][][] transformToBytesValuesMV(ProjectionBlock projectionBlock) {
+    int length = projectionBlock.getNumDocs();
+    if (_bytesValuesMV == null) {
+      _bytesValuesMV = new byte[length][][];
+    }
+    Dictionary dictionary = getDictionary();
+    if (dictionary != null) {
+      int[][] dictIdsMV = transformToDictIdsMV(projectionBlock);
+      for (int i = 0; i < length; i++) {
+        int[] dictIds = dictIdsMV[i];
+        int numValues = dictIds.length;
+        byte[][] bytesValues = new byte[numValues][];
+        dictionary.readBytesValues(dictIds, numValues, bytesValues);
+        _bytesValuesMV[i] = bytesValues;
+      }
+    } else {
+      Preconditions.checkState(getResultMetadata().getDataType().getStoredType() == DataType.STRING);

Review Comment:
   Use `assert` instead which doesn't have overhead
   ```suggestion
         assert getResultMetadata().getDataType().getStoredType() == DataType.STRING;
   ```



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/dictionary/StringOnHeapMutableDictionary.java:
##########
@@ -157,7 +157,7 @@ public String getStringValue(int dictId) {
 
   @Override
   public byte[] getBytesValue(int dictId) {
-    return BytesUtils.toBytes(getStringValue(dictId));
+    return getStringValue(dictId).getBytes(StandardCharsets.UTF_8);

Review Comment:
   (nit) `import static java.nio.charset.StandardCharsets.UTF_8;` to be consistent with other classes



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java:
##########
@@ -311,6 +311,10 @@ protected String getUnpaddedString(int dictId, byte[] buffer) {
     return _valueReader.getUnpaddedString(dictId, _numBytesPerValue, _paddingByte, buffer);
   }
 
+  protected byte[] getUnpaddedBytes(int dictId, byte[] buffer) {

Review Comment:
   (minor) Move it before `getUnpaddedString()`



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java:
##########
@@ -77,7 +76,8 @@ public String getStringValue(int dictId) {
 
   @Override
   public byte[] getBytesValue(int dictId) {
-    return BytesUtils.toBytes(getUnpaddedString(dictId, getBuffer()));
+    byte[] buffer = getBuffer();
+    return getUnpaddedBytes(dictId, buffer);

Review Comment:
   (nit)
   ```suggestion
       return getUnpaddedBytes(dictId, getBuffer());
   ```



##########
pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/mutable/MutableForwardIndex.java:
##########
@@ -552,4 +552,48 @@ default void setDoubleMV(int docId, double[] values) {
   default void setStringMV(int docId, String[] values) {
     throw new UnsupportedOperationException();
   }
+
+  /**
+   * Reads the BYTES type multi-value at the given document id into the passed in value buffer (the buffer size must be
+   * enough to hold all the values for the multi-value entry) and returns the number of values within the multi-value
+   * entry.
+   *
+   * @param docId Document id
+   * @param valueBuffer Value buffer
+   * @return Number of values within the multi-value entry
+   */
+  default int getBytesMV(int docId, byte[][] valueBuffer) {

Review Comment:
   (minor) Move the getters after the string getters



##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/BytesUtils.java:
##########
@@ -40,6 +41,25 @@ public static byte[] toBytes(String stringValue) {
     }
   }
 
+  /**
+   * Convert a string to UTF8 byte array.
+   * @param stringValue
+   * @return UTF8 byte array.
+   */
+  public static byte[] toUTF8Bytes(String stringValue) {

Review Comment:
   Are these 2 methods used? IIRC we used to have these methods in `StringUtil`, but then removed them because there is no much point adding a util function for a one code.



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/OnHeapStringDictionary.java:
##########
@@ -55,6 +58,7 @@ public OnHeapStringDictionary(PinotDataBuffer dataBuffer, int length, int numByt
       String unpaddedString = getUnpaddedString(i, buffer);
       _unpaddedStrings[i] = unpaddedString;
       _unPaddedStringToIdMap.put(unpaddedString, i);
+      _unpaddedBytes[i] = getUnpaddedBytes(i, buffer);

Review Comment:
   Only read the unpadded bytes, and directly create unpaddedString from the unpadded bytes for better performance



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org