You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by "Jackie-Jiang (via GitHub)" <gi...@apache.org> on 2024/02/26 23:04:33 UTC

[PR] Fix ColumnMinMaxValueGenerator [pinot]

Jackie-Jiang opened a new pull request, #12502:
URL: https://github.com/apache/pinot/pull/12502

   Fix the following issues:
   - Support generating min/max value for BIG_DECIMAL column
   - Do not abort the segment load when min/max value cannot be generated
   - Do not persist min/max value when its length is over the limit (512 characters). We cannot persist the approximate value because we might directly use the value as the query result
   
   TODO: We should add unit test for this. Tracked with #12501


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "Jackie-Jiang (via GitHub)" <gi...@apache.org>.
Jackie-Jiang commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503471374


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "
+          + "persisting them", columnName, _segmentMetadata.getName(), e);
+    }
+  }
 
+  private void addColumnMinMaxValueWithDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    try (Dictionary dictionary = getDictionaryForColumn(columnMetadata)) {
+      SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnMetadata.getColumnName(),
+          dictionary.getInternal(0), dictionary.getInternal(dictionary.length() - 1),
+          columnMetadata.getDataType().getStoredType());
+    }
+  }
+
+  private Dictionary getDictionaryForColumn(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
+    DataType dataType = columnMetadata.getDataType();
+    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+    int length = columnMetadata.getCardinality();
+    switch (dataType.getStoredType()) {
+      case INT:
+        return new IntDictionary(dictionaryBuffer, length);
+      case LONG:
+        return new LongDictionary(dictionaryBuffer, length);
+      case FLOAT:
+        return new FloatDictionary(dictionaryBuffer, length);
+      case DOUBLE:
+        return new DoubleDictionary(dictionaryBuffer, length);
+      case BIG_DECIMAL:
+        return new BigDecimalDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case STRING:
+        return new StringDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case BYTES:
+        return new BytesDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      default:
+        throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+    }
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private void addColumnMinMaxValueWithoutDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
     DataType dataType = columnMetadata.getDataType();
     DataType storedType = dataType.getStoredType();
-    if (columnMetadata.hasDictionary()) {
-      PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
-      int length = columnMetadata.getCardinality();
-      switch (storedType) {
-        case INT:
-          try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case LONG:
-          try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case FLOAT:
-          try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case DOUBLE:
-          try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case STRING:
-          try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case BYTES:
-          try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        default:
-          throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
-      }
-    } else {
-      // setting min/max for non-dictionary columns.
+    boolean isSingleValue = columnMetadata.isSingleValue();
+    PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
+    try (ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
+        isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
       int numDocs = columnMetadata.getTotalDocs();
-      PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
-      boolean isSingleValue = _segmentMetadata.getSchema().getFieldSpecFor(columnName).isSingleValueField();
-      try (
-          ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
-              isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
-        switch (storedType) {
-          case INT: {
-            int min = Integer.MAX_VALUE;
-            int max = Integer.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int value = rawIndexReader.getInt(docId, readerContext);
+      Object minValue;
+      Object maxValue;
+      switch (storedType) {
+        case INT: {
+          int min = Integer.MAX_VALUE;
+          int max = Integer.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int value = rawIndexReader.getInt(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int[] values = rawIndexReader.getIntMV(docId, readerContext);
+              for (int value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int[] values = rawIndexReader.getIntMV(docId, readerContext);
-                for (int value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case LONG: {
-            long min = Long.MAX_VALUE;
-            long max = Long.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long value = rawIndexReader.getLong(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case LONG: {
+          long min = Long.MAX_VALUE;
+          long max = Long.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long value = rawIndexReader.getLong(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long[] values = rawIndexReader.getLongMV(docId, readerContext);
+              for (long value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long[] values = rawIndexReader.getLongMV(docId, readerContext);
-                for (long value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case FLOAT: {
-            float min = Float.POSITIVE_INFINITY;
-            float max = Float.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float value = rawIndexReader.getFloat(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case FLOAT: {
+          float min = Float.POSITIVE_INFINITY;
+          float max = Float.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float value = rawIndexReader.getFloat(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float[] values = rawIndexReader.getFloatMV(docId, readerContext);
+              for (float value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float[] values = rawIndexReader.getFloatMV(docId, readerContext);
-                for (float value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case DOUBLE: {
-            double min = Double.POSITIVE_INFINITY;
-            double max = Double.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double value = rawIndexReader.getDouble(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case DOUBLE: {
+          double min = Double.POSITIVE_INFINITY;
+          double max = Double.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double value = rawIndexReader.getDouble(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
+              for (double value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
-                for (double value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case STRING: {
-            String min = null;
-            String max = null;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                String value = rawIndexReader.getString(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case BIG_DECIMAL: {
+          Preconditions.checkState(isSingleValue, "Unsupported multi-value BIG_DECIMAL column: %s", columnName);

Review Comment:
   MV is only allowed within `DIMENSION`, and it is very rare to configure `BIG_DECIMAL` as dimension. In the future we can consider adding MV support, but as of now we don't have APIs to read MV BigDecimal values



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "Jackie-Jiang (via GitHub)" <gi...@apache.org>.
Jackie-Jiang commented on PR #12502:
URL: https://github.com/apache/pinot/pull/12502#issuecomment-1965492288

   @jadami10 Please take a look
   
   @abhioncbr I have to revert some of the changes in #10990 as I just realize we cannot keep approximate value as min/max because they can be used as query result


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "Jackie-Jiang (via GitHub)" <gi...@apache.org>.
Jackie-Jiang merged PR #12502:
URL: https://github.com/apache/pinot/pull/12502


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "Jackie-Jiang (via GitHub)" <gi...@apache.org>.
Jackie-Jiang commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503469768


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java:
##########
@@ -639,64 +636,17 @@ public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties,
   }
 
   /**
-   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is not supported in
-   * {@link PropertiesConfiguration}, e.g. contains character with surrogate.
+   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is too long (longer
+   * than 512 characters), or is not supported in {@link PropertiesConfiguration}, e.g. contains character with
+   * surrogate.
    */
   @Nullable
-  private static String getValidPropertyValue(String value, boolean isMax, DataType storedType) {
-    String valueWithinLengthLimit = getValueWithinLengthLimit(value, isMax, storedType);
-    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(
-        valueWithinLengthLimit) : valueWithinLengthLimit;
-  }
-
-  /**
-   * Returns the original string if its length is within the allowed limit. If the string's length exceeds the limit,
-   * returns a truncated version of the string with maintaining min or max value.
-   */
-  @VisibleForTesting
-  static String getValueWithinLengthLimit(String value, boolean isMax, DataType storedType) {
-    int length = value.length();
-    if (length <= METADATA_PROPERTY_LENGTH_LIMIT) {
-      return value;
-    }
-    switch (storedType) {
-      case STRING:
-        if (isMax) {
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT - 1;
-          // determining the index for the character having value less than '\uFFFF'
-          while (trimIndexValue < length && value.charAt(trimIndexValue) == '\uFFFF') {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == length) {
-            return value;
-          } else {
-            // assigning the '\uFFFF' to make the value max.
-            return value.substring(0, trimIndexValue) + '\uFFFF';
-          }
-        } else {
-          return value.substring(0, METADATA_PROPERTY_LENGTH_LIMIT);
-        }
-      case BYTES:
-        if (isMax) {
-          byte[] valueInByteArray = BytesUtils.toBytes(value);
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT / 2 - 1;
-          // determining the index for the byte having value less than 0xFF
-          while (trimIndexValue < valueInByteArray.length && valueInByteArray[trimIndexValue] == (byte) 0xFF) {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == valueInByteArray.length) {
-            return value;
-          } else {
-            byte[] shortByteValue = Arrays.copyOf(valueInByteArray, trimIndexValue + 1);
-            shortByteValue[trimIndexValue] = (byte) 0xFF; // assigning the 0xFF to make the value max.
-            return BytesUtils.toHexString(shortByteValue);
-          }
-        } else {
-          return BytesUtils.toHexString(Arrays.copyOf(BytesUtils.toBytes(value), (METADATA_PROPERTY_LENGTH_LIMIT / 2)));
-        }
-      default:
-        throw new IllegalStateException("Unsupported stored type for property value length reduction: " + storedType);
+  private static String getValidPropertyValue(String value, DataType storedType) {
+    if (value.length() > METADATA_PROPERTY_LENGTH_LIMIT) {

Review Comment:
   When the value is over the length limit, with current code, we pick a closest value within the length limit as the approximation



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "abhioncbr (via GitHub)" <gi...@apache.org>.
abhioncbr commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503454225


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java:
##########
@@ -639,64 +636,17 @@ public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties,
   }
 
   /**
-   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is not supported in
-   * {@link PropertiesConfiguration}, e.g. contains character with surrogate.
+   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is too long (longer
+   * than 512 characters), or is not supported in {@link PropertiesConfiguration}, e.g. contains character with
+   * surrogate.
    */
   @Nullable
-  private static String getValidPropertyValue(String value, boolean isMax, DataType storedType) {
-    String valueWithinLengthLimit = getValueWithinLengthLimit(value, isMax, storedType);
-    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(
-        valueWithinLengthLimit) : valueWithinLengthLimit;
-  }
-
-  /**
-   * Returns the original string if its length is within the allowed limit. If the string's length exceeds the limit,
-   * returns a truncated version of the string with maintaining min or max value.
-   */
-  @VisibleForTesting
-  static String getValueWithinLengthLimit(String value, boolean isMax, DataType storedType) {
-    int length = value.length();
-    if (length <= METADATA_PROPERTY_LENGTH_LIMIT) {
-      return value;
-    }
-    switch (storedType) {
-      case STRING:
-        if (isMax) {
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT - 1;
-          // determining the index for the character having value less than '\uFFFF'
-          while (trimIndexValue < length && value.charAt(trimIndexValue) == '\uFFFF') {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == length) {
-            return value;
-          } else {
-            // assigning the '\uFFFF' to make the value max.
-            return value.substring(0, trimIndexValue) + '\uFFFF';
-          }
-        } else {
-          return value.substring(0, METADATA_PROPERTY_LENGTH_LIMIT);
-        }
-      case BYTES:
-        if (isMax) {
-          byte[] valueInByteArray = BytesUtils.toBytes(value);
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT / 2 - 1;
-          // determining the index for the byte having value less than 0xFF
-          while (trimIndexValue < valueInByteArray.length && valueInByteArray[trimIndexValue] == (byte) 0xFF) {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == valueInByteArray.length) {
-            return value;
-          } else {
-            byte[] shortByteValue = Arrays.copyOf(valueInByteArray, trimIndexValue + 1);
-            shortByteValue[trimIndexValue] = (byte) 0xFF; // assigning the 0xFF to make the value max.
-            return BytesUtils.toHexString(shortByteValue);
-          }
-        } else {
-          return BytesUtils.toHexString(Arrays.copyOf(BytesUtils.toBytes(value), (METADATA_PROPERTY_LENGTH_LIMIT / 2)));
-        }
-      default:
-        throw new IllegalStateException("Unsupported stored type for property value length reduction: " + storedType);
+  private static String getValidPropertyValue(String value, DataType storedType) {
+    if (value.length() > METADATA_PROPERTY_LENGTH_LIMIT) {
+      return null;
     }
+    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(value)

Review Comment:
   When we did this at that moment we were commons-configuration1 and there were certain challenges with it for special character support. 
   We have moved to commons-configuration2 recently  and we can double check that whether it's still an issue or not. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "codecov-commenter (via GitHub)" <gi...@apache.org>.
codecov-commenter commented on PR #12502:
URL: https://github.com/apache/pinot/pull/12502#issuecomment-1965534131

   ## [Codecov](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=h1&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) Report
   Attention: Patch coverage is `38.84892%` with `85 lines` in your changes are missing coverage. Please review.
   > Project coverage is 27.70%. Comparing base [(`59551e4`)](https://app.codecov.io/gh/apache/pinot/commit/59551e45224f1535c4863fd577622b37366ccc97?el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) to head [(`62fb09b`)](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=desc&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache).
   > Report is 14 commits behind head on master.
   
   | [Files](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | Patch % | Lines |
   |---|---|---|
   | [.../columnminmaxvalue/ColumnMinMaxValueGenerator.java](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#diff-cGlub3Qtc2VnbWVudC1sb2NhbC9zcmMvbWFpbi9qYXZhL29yZy9hcGFjaGUvcGlub3Qvc2VnbWVudC9sb2NhbC9zZWdtZW50L2luZGV4L2xvYWRlci9jb2x1bW5taW5tYXh2YWx1ZS9Db2x1bW5NaW5NYXhWYWx1ZUdlbmVyYXRvci5qYXZh) | 38.63% | [78 Missing and 3 partials :warning: ](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) |
   | [...ment/creator/impl/SegmentColumnarIndexCreator.java](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#diff-cGlub3Qtc2VnbWVudC1sb2NhbC9zcmMvbWFpbi9qYXZhL29yZy9hcGFjaGUvcGlub3Qvc2VnbWVudC9sb2NhbC9zZWdtZW50L2NyZWF0b3IvaW1wbC9TZWdtZW50Q29sdW1uYXJJbmRleENyZWF0b3IuamF2YQ==) | 42.85% | [1 Missing and 3 partials :warning: ](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) |
   
   <details><summary>Additional details and impacted files</summary>
   
   
   ```diff
   @@              Coverage Diff              @@
   ##             master   #12502       +/-   ##
   =============================================
   - Coverage     61.75%   27.70%   -34.05%     
     Complexity      207      207               
   =============================================
     Files          2436     2436               
     Lines        133233   133227        -6     
     Branches      20636    20642        +6     
   =============================================
   - Hits          82274    36911    -45363     
   - Misses        44911    93502    +48591     
   + Partials       6048     2814     -3234     
   ```
   
   | [Flag](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flags&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | Coverage Δ | |
   |---|---|---|
   | [custom-integration1](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `<0.01% <0.00%> (ø)` | |
   | [integration](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `<0.01% <0.00%> (ø)` | |
   | [integration1](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `<0.01% <0.00%> (ø)` | |
   | [integration2](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `0.00% <0.00%> (ø)` | |
   | [java-11](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `?` | |
   | [java-21](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `27.70% <38.84%> (-33.92%)` | :arrow_down: |
   | [skip-bytebuffers-false](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `<0.01% <0.00%> (-61.75%)` | :arrow_down: |
   | [skip-bytebuffers-true](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `27.70% <38.84%> (-0.03%)` | :arrow_down: |
   | [temurin](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `27.70% <38.84%> (-34.05%)` | :arrow_down: |
   | [unittests](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `27.70% <38.84%> (-34.05%)` | :arrow_down: |
   | [unittests1](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `?` | |
   | [unittests2](https://app.codecov.io/gh/apache/pinot/pull/12502/flags?src=pr&el=flag&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache) | `27.70% <38.84%> (-0.04%)` | :arrow_down: |
   
   Flags with carried forward coverage won't be shown. [Click here](https://docs.codecov.io/docs/carryforward-flags?utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache#carryforward-flags-in-the-pull-request-comment) to find out more.
   
   
   </details>
   
   [:umbrella: View full report in Codecov by Sentry](https://app.codecov.io/gh/apache/pinot/pull/12502?src=pr&el=continue&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache).   
   :loudspeaker: Have feedback on the report? [Share it here](https://about.codecov.io/codecov-pr-comment-feedback/?utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=apache).
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "abhioncbr (via GitHub)" <gi...@apache.org>.
abhioncbr commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503454225


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java:
##########
@@ -639,64 +636,17 @@ public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties,
   }
 
   /**
-   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is not supported in
-   * {@link PropertiesConfiguration}, e.g. contains character with surrogate.
+   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is too long (longer
+   * than 512 characters), or is not supported in {@link PropertiesConfiguration}, e.g. contains character with
+   * surrogate.
    */
   @Nullable
-  private static String getValidPropertyValue(String value, boolean isMax, DataType storedType) {
-    String valueWithinLengthLimit = getValueWithinLengthLimit(value, isMax, storedType);
-    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(
-        valueWithinLengthLimit) : valueWithinLengthLimit;
-  }
-
-  /**
-   * Returns the original string if its length is within the allowed limit. If the string's length exceeds the limit,
-   * returns a truncated version of the string with maintaining min or max value.
-   */
-  @VisibleForTesting
-  static String getValueWithinLengthLimit(String value, boolean isMax, DataType storedType) {
-    int length = value.length();
-    if (length <= METADATA_PROPERTY_LENGTH_LIMIT) {
-      return value;
-    }
-    switch (storedType) {
-      case STRING:
-        if (isMax) {
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT - 1;
-          // determining the index for the character having value less than '\uFFFF'
-          while (trimIndexValue < length && value.charAt(trimIndexValue) == '\uFFFF') {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == length) {
-            return value;
-          } else {
-            // assigning the '\uFFFF' to make the value max.
-            return value.substring(0, trimIndexValue) + '\uFFFF';
-          }
-        } else {
-          return value.substring(0, METADATA_PROPERTY_LENGTH_LIMIT);
-        }
-      case BYTES:
-        if (isMax) {
-          byte[] valueInByteArray = BytesUtils.toBytes(value);
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT / 2 - 1;
-          // determining the index for the byte having value less than 0xFF
-          while (trimIndexValue < valueInByteArray.length && valueInByteArray[trimIndexValue] == (byte) 0xFF) {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == valueInByteArray.length) {
-            return value;
-          } else {
-            byte[] shortByteValue = Arrays.copyOf(valueInByteArray, trimIndexValue + 1);
-            shortByteValue[trimIndexValue] = (byte) 0xFF; // assigning the 0xFF to make the value max.
-            return BytesUtils.toHexString(shortByteValue);
-          }
-        } else {
-          return BytesUtils.toHexString(Arrays.copyOf(BytesUtils.toBytes(value), (METADATA_PROPERTY_LENGTH_LIMIT / 2)));
-        }
-      default:
-        throw new IllegalStateException("Unsupported stored type for property value length reduction: " + storedType);
+  private static String getValidPropertyValue(String value, DataType storedType) {
+    if (value.length() > METADATA_PROPERTY_LENGTH_LIMIT) {
+      return null;
     }
+    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(value)

Review Comment:
   When we did this at that moment we were using commons-configuration1 library for propertiesConfiguration and there were certain challenges with it for special character support. 
   We have moved to commons-configuration2 recently  and we can double check that whether it's still an issue or not. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "Jackie-Jiang (via GitHub)" <gi...@apache.org>.
Jackie-Jiang commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503471880


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "
+          + "persisting them", columnName, _segmentMetadata.getName(), e);
+    }
+  }
 
+  private void addColumnMinMaxValueWithDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    try (Dictionary dictionary = getDictionaryForColumn(columnMetadata)) {
+      SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnMetadata.getColumnName(),
+          dictionary.getInternal(0), dictionary.getInternal(dictionary.length() - 1),
+          columnMetadata.getDataType().getStoredType());
+    }
+  }
+
+  private Dictionary getDictionaryForColumn(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
+    DataType dataType = columnMetadata.getDataType();
+    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+    int length = columnMetadata.getCardinality();
+    switch (dataType.getStoredType()) {
+      case INT:
+        return new IntDictionary(dictionaryBuffer, length);
+      case LONG:
+        return new LongDictionary(dictionaryBuffer, length);
+      case FLOAT:
+        return new FloatDictionary(dictionaryBuffer, length);
+      case DOUBLE:
+        return new DoubleDictionary(dictionaryBuffer, length);
+      case BIG_DECIMAL:
+        return new BigDecimalDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case STRING:
+        return new StringDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case BYTES:
+        return new BytesDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      default:
+        throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+    }
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private void addColumnMinMaxValueWithoutDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
     DataType dataType = columnMetadata.getDataType();
     DataType storedType = dataType.getStoredType();
-    if (columnMetadata.hasDictionary()) {
-      PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
-      int length = columnMetadata.getCardinality();
-      switch (storedType) {
-        case INT:
-          try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case LONG:
-          try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case FLOAT:
-          try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case DOUBLE:
-          try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case STRING:
-          try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case BYTES:
-          try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        default:
-          throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
-      }
-    } else {
-      // setting min/max for non-dictionary columns.
+    boolean isSingleValue = columnMetadata.isSingleValue();
+    PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
+    try (ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
+        isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
       int numDocs = columnMetadata.getTotalDocs();
-      PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
-      boolean isSingleValue = _segmentMetadata.getSchema().getFieldSpecFor(columnName).isSingleValueField();
-      try (
-          ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
-              isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
-        switch (storedType) {
-          case INT: {
-            int min = Integer.MAX_VALUE;
-            int max = Integer.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int value = rawIndexReader.getInt(docId, readerContext);
+      Object minValue;
+      Object maxValue;
+      switch (storedType) {
+        case INT: {
+          int min = Integer.MAX_VALUE;
+          int max = Integer.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int value = rawIndexReader.getInt(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int[] values = rawIndexReader.getIntMV(docId, readerContext);
+              for (int value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int[] values = rawIndexReader.getIntMV(docId, readerContext);
-                for (int value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case LONG: {
-            long min = Long.MAX_VALUE;
-            long max = Long.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long value = rawIndexReader.getLong(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case LONG: {
+          long min = Long.MAX_VALUE;
+          long max = Long.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long value = rawIndexReader.getLong(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long[] values = rawIndexReader.getLongMV(docId, readerContext);
+              for (long value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long[] values = rawIndexReader.getLongMV(docId, readerContext);
-                for (long value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case FLOAT: {
-            float min = Float.POSITIVE_INFINITY;
-            float max = Float.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float value = rawIndexReader.getFloat(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case FLOAT: {
+          float min = Float.POSITIVE_INFINITY;
+          float max = Float.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float value = rawIndexReader.getFloat(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float[] values = rawIndexReader.getFloatMV(docId, readerContext);
+              for (float value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float[] values = rawIndexReader.getFloatMV(docId, readerContext);
-                for (float value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case DOUBLE: {
-            double min = Double.POSITIVE_INFINITY;
-            double max = Double.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double value = rawIndexReader.getDouble(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case DOUBLE: {
+          double min = Double.POSITIVE_INFINITY;
+          double max = Double.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double value = rawIndexReader.getDouble(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
+              for (double value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
-                for (double value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case STRING: {
-            String min = null;
-            String max = null;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                String value = rawIndexReader.getString(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case BIG_DECIMAL: {
+          Preconditions.checkState(isSingleValue, "Unsupported multi-value BIG_DECIMAL column: %s", columnName);
+          BigDecimal min = null;
+          BigDecimal max = null;
+          for (int docId = 1; docId < numDocs; docId++) {

Review Comment:
   Good catch! This is a bug (initially I put first value as min/max, then forgot to change it back..)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org


Re: [PR] Fix ColumnMinMaxValueGenerator [pinot]

Posted by "jadami10 (via GitHub)" <gi...@apache.org>.
jadami10 commented on code in PR #12502:
URL: https://github.com/apache/pinot/pull/12502#discussion_r1503435128


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "

Review Comment:
   this is the part that's definitely more informed and thorough than my fix. So any errors here are now logged rather than throwing. thank you



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java:
##########
@@ -639,64 +636,17 @@ public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties,
   }
 
   /**
-   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is not supported in
-   * {@link PropertiesConfiguration}, e.g. contains character with surrogate.
+   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is too long (longer
+   * than 512 characters), or is not supported in {@link PropertiesConfiguration}, e.g. contains character with
+   * surrogate.
    */
   @Nullable
-  private static String getValidPropertyValue(String value, boolean isMax, DataType storedType) {
-    String valueWithinLengthLimit = getValueWithinLengthLimit(value, isMax, storedType);
-    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(
-        valueWithinLengthLimit) : valueWithinLengthLimit;
-  }
-
-  /**
-   * Returns the original string if its length is within the allowed limit. If the string's length exceeds the limit,
-   * returns a truncated version of the string with maintaining min or max value.
-   */
-  @VisibleForTesting
-  static String getValueWithinLengthLimit(String value, boolean isMax, DataType storedType) {
-    int length = value.length();
-    if (length <= METADATA_PROPERTY_LENGTH_LIMIT) {
-      return value;
-    }
-    switch (storedType) {
-      case STRING:
-        if (isMax) {
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT - 1;
-          // determining the index for the character having value less than '\uFFFF'
-          while (trimIndexValue < length && value.charAt(trimIndexValue) == '\uFFFF') {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == length) {
-            return value;
-          } else {
-            // assigning the '\uFFFF' to make the value max.
-            return value.substring(0, trimIndexValue) + '\uFFFF';
-          }
-        } else {
-          return value.substring(0, METADATA_PROPERTY_LENGTH_LIMIT);
-        }
-      case BYTES:
-        if (isMax) {
-          byte[] valueInByteArray = BytesUtils.toBytes(value);
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT / 2 - 1;
-          // determining the index for the byte having value less than 0xFF
-          while (trimIndexValue < valueInByteArray.length && valueInByteArray[trimIndexValue] == (byte) 0xFF) {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == valueInByteArray.length) {
-            return value;
-          } else {
-            byte[] shortByteValue = Arrays.copyOf(valueInByteArray, trimIndexValue + 1);
-            shortByteValue[trimIndexValue] = (byte) 0xFF; // assigning the 0xFF to make the value max.
-            return BytesUtils.toHexString(shortByteValue);
-          }
-        } else {
-          return BytesUtils.toHexString(Arrays.copyOf(BytesUtils.toBytes(value), (METADATA_PROPERTY_LENGTH_LIMIT / 2)));
-        }
-      default:
-        throw new IllegalStateException("Unsupported stored type for property value length reduction: " + storedType);
+  private static String getValidPropertyValue(String value, DataType storedType) {
+    if (value.length() > METADATA_PROPERTY_LENGTH_LIMIT) {
+      return null;
     }
+    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(value)

Review Comment:
   what's the issue with keeping special characters in? does `PropertiesConfiguration` not support them?



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "
+          + "persisting them", columnName, _segmentMetadata.getName(), e);
+    }
+  }
 
+  private void addColumnMinMaxValueWithDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    try (Dictionary dictionary = getDictionaryForColumn(columnMetadata)) {
+      SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnMetadata.getColumnName(),

Review Comment:
   nice simplification here!



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "
+          + "persisting them", columnName, _segmentMetadata.getName(), e);
+    }
+  }
 
+  private void addColumnMinMaxValueWithDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    try (Dictionary dictionary = getDictionaryForColumn(columnMetadata)) {
+      SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnMetadata.getColumnName(),
+          dictionary.getInternal(0), dictionary.getInternal(dictionary.length() - 1),
+          columnMetadata.getDataType().getStoredType());
+    }
+  }
+
+  private Dictionary getDictionaryForColumn(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
+    DataType dataType = columnMetadata.getDataType();
+    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+    int length = columnMetadata.getCardinality();
+    switch (dataType.getStoredType()) {
+      case INT:
+        return new IntDictionary(dictionaryBuffer, length);
+      case LONG:
+        return new LongDictionary(dictionaryBuffer, length);
+      case FLOAT:
+        return new FloatDictionary(dictionaryBuffer, length);
+      case DOUBLE:
+        return new DoubleDictionary(dictionaryBuffer, length);
+      case BIG_DECIMAL:
+        return new BigDecimalDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case STRING:
+        return new StringDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case BYTES:
+        return new BytesDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      default:
+        throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+    }
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private void addColumnMinMaxValueWithoutDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
     DataType dataType = columnMetadata.getDataType();
     DataType storedType = dataType.getStoredType();
-    if (columnMetadata.hasDictionary()) {
-      PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
-      int length = columnMetadata.getCardinality();
-      switch (storedType) {
-        case INT:
-          try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case LONG:
-          try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case FLOAT:
-          try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case DOUBLE:
-          try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case STRING:
-          try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case BYTES:
-          try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        default:
-          throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
-      }
-    } else {
-      // setting min/max for non-dictionary columns.
+    boolean isSingleValue = columnMetadata.isSingleValue();
+    PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
+    try (ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
+        isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
       int numDocs = columnMetadata.getTotalDocs();
-      PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
-      boolean isSingleValue = _segmentMetadata.getSchema().getFieldSpecFor(columnName).isSingleValueField();
-      try (
-          ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
-              isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
-        switch (storedType) {
-          case INT: {
-            int min = Integer.MAX_VALUE;
-            int max = Integer.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int value = rawIndexReader.getInt(docId, readerContext);
+      Object minValue;
+      Object maxValue;
+      switch (storedType) {
+        case INT: {
+          int min = Integer.MAX_VALUE;
+          int max = Integer.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int value = rawIndexReader.getInt(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int[] values = rawIndexReader.getIntMV(docId, readerContext);
+              for (int value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int[] values = rawIndexReader.getIntMV(docId, readerContext);
-                for (int value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case LONG: {
-            long min = Long.MAX_VALUE;
-            long max = Long.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long value = rawIndexReader.getLong(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case LONG: {
+          long min = Long.MAX_VALUE;
+          long max = Long.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long value = rawIndexReader.getLong(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long[] values = rawIndexReader.getLongMV(docId, readerContext);
+              for (long value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long[] values = rawIndexReader.getLongMV(docId, readerContext);
-                for (long value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case FLOAT: {
-            float min = Float.POSITIVE_INFINITY;
-            float max = Float.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float value = rawIndexReader.getFloat(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case FLOAT: {
+          float min = Float.POSITIVE_INFINITY;
+          float max = Float.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float value = rawIndexReader.getFloat(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float[] values = rawIndexReader.getFloatMV(docId, readerContext);
+              for (float value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float[] values = rawIndexReader.getFloatMV(docId, readerContext);
-                for (float value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case DOUBLE: {
-            double min = Double.POSITIVE_INFINITY;
-            double max = Double.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double value = rawIndexReader.getDouble(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case DOUBLE: {
+          double min = Double.POSITIVE_INFINITY;
+          double max = Double.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double value = rawIndexReader.getDouble(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
+              for (double value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
-                for (double value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case STRING: {
-            String min = null;
-            String max = null;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                String value = rawIndexReader.getString(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case BIG_DECIMAL: {
+          Preconditions.checkState(isSingleValue, "Unsupported multi-value BIG_DECIMAL column: %s", columnName);
+          BigDecimal min = null;
+          BigDecimal max = null;
+          for (int docId = 1; docId < numDocs; docId++) {

Review Comment:
   why is this starting at `docId` 1 instead of 0?



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/SegmentColumnarIndexCreator.java:
##########
@@ -639,64 +636,17 @@ public static void addColumnMinMaxValueInfo(PropertiesConfiguration properties,
   }
 
   /**
-   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is not supported in
-   * {@link PropertiesConfiguration}, e.g. contains character with surrogate.
+   * Helper method to get the valid value for setting min/max. Returns {@code null} if the value is too long (longer
+   * than 512 characters), or is not supported in {@link PropertiesConfiguration}, e.g. contains character with
+   * surrogate.
    */
   @Nullable
-  private static String getValidPropertyValue(String value, boolean isMax, DataType storedType) {
-    String valueWithinLengthLimit = getValueWithinLengthLimit(value, isMax, storedType);
-    return storedType == DataType.STRING ? CommonsConfigurationUtils.replaceSpecialCharacterInPropertyValue(
-        valueWithinLengthLimit) : valueWithinLengthLimit;
-  }
-
-  /**
-   * Returns the original string if its length is within the allowed limit. If the string's length exceeds the limit,
-   * returns a truncated version of the string with maintaining min or max value.
-   */
-  @VisibleForTesting
-  static String getValueWithinLengthLimit(String value, boolean isMax, DataType storedType) {
-    int length = value.length();
-    if (length <= METADATA_PROPERTY_LENGTH_LIMIT) {
-      return value;
-    }
-    switch (storedType) {
-      case STRING:
-        if (isMax) {
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT - 1;
-          // determining the index for the character having value less than '\uFFFF'
-          while (trimIndexValue < length && value.charAt(trimIndexValue) == '\uFFFF') {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == length) {
-            return value;
-          } else {
-            // assigning the '\uFFFF' to make the value max.
-            return value.substring(0, trimIndexValue) + '\uFFFF';
-          }
-        } else {
-          return value.substring(0, METADATA_PROPERTY_LENGTH_LIMIT);
-        }
-      case BYTES:
-        if (isMax) {
-          byte[] valueInByteArray = BytesUtils.toBytes(value);
-          int trimIndexValue = METADATA_PROPERTY_LENGTH_LIMIT / 2 - 1;
-          // determining the index for the byte having value less than 0xFF
-          while (trimIndexValue < valueInByteArray.length && valueInByteArray[trimIndexValue] == (byte) 0xFF) {
-            trimIndexValue++;
-          }
-          if (trimIndexValue == valueInByteArray.length) {
-            return value;
-          } else {
-            byte[] shortByteValue = Arrays.copyOf(valueInByteArray, trimIndexValue + 1);
-            shortByteValue[trimIndexValue] = (byte) 0xFF; // assigning the 0xFF to make the value max.
-            return BytesUtils.toHexString(shortByteValue);
-          }
-        } else {
-          return BytesUtils.toHexString(Arrays.copyOf(BytesUtils.toBytes(value), (METADATA_PROPERTY_LENGTH_LIMIT / 2)));
-        }
-      default:
-        throw new IllegalStateException("Unsupported stored type for property value length reduction: " + storedType);
+  private static String getValidPropertyValue(String value, DataType storedType) {
+    if (value.length() > METADATA_PROPERTY_LENGTH_LIMIT) {

Review Comment:
   is the truncation what's causing incorrectness issue?



##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -124,227 +131,251 @@ private List<String> getColumnsToAddMinMaxValue() {
   }
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
-    ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
+    return needAddColumnMinMaxValueForColumn(_segmentMetadata.getColumnMetadataFor(columnName));
+  }
+
+  private boolean needAddColumnMinMaxValueForColumn(ColumnMetadata columnMetadata) {
     return columnMetadata.getMinValue() == null && columnMetadata.getMaxValue() == null
         && !columnMetadata.isMinMaxValueInvalid();
   }
 
-  private void addColumnMinMaxValueForColumn(String columnName)
-      throws Exception {
-    // Skip column with min/max value already set
+  private void addColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
+    if (!needAddColumnMinMaxValueForColumn(columnMetadata)) {
       return;
     }
+    try {
+      if (columnMetadata.hasDictionary()) {
+        addColumnMinMaxValueWithDictionary(columnMetadata);
+      } else {
+        addColumnMinMaxValueWithoutDictionary(columnMetadata);
+      }
+      _minMaxValueAdded = true;
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while generating min/max value for column: {} in segment: {}, continuing without "
+          + "persisting them", columnName, _segmentMetadata.getName(), e);
+    }
+  }
 
+  private void addColumnMinMaxValueWithDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    try (Dictionary dictionary = getDictionaryForColumn(columnMetadata)) {
+      SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnMetadata.getColumnName(),
+          dictionary.getInternal(0), dictionary.getInternal(dictionary.length() - 1),
+          columnMetadata.getDataType().getStoredType());
+    }
+  }
+
+  private Dictionary getDictionaryForColumn(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
+    DataType dataType = columnMetadata.getDataType();
+    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+    int length = columnMetadata.getCardinality();
+    switch (dataType.getStoredType()) {
+      case INT:
+        return new IntDictionary(dictionaryBuffer, length);
+      case LONG:
+        return new LongDictionary(dictionaryBuffer, length);
+      case FLOAT:
+        return new FloatDictionary(dictionaryBuffer, length);
+      case DOUBLE:
+        return new DoubleDictionary(dictionaryBuffer, length);
+      case BIG_DECIMAL:
+        return new BigDecimalDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case STRING:
+        return new StringDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      case BYTES:
+        return new BytesDictionary(dictionaryBuffer, length, columnMetadata.getColumnMaxLength());
+      default:
+        throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+    }
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private void addColumnMinMaxValueWithoutDictionary(ColumnMetadata columnMetadata)
+      throws IOException {
+    String columnName = columnMetadata.getColumnName();
     DataType dataType = columnMetadata.getDataType();
     DataType storedType = dataType.getStoredType();
-    if (columnMetadata.hasDictionary()) {
-      PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
-      int length = columnMetadata.getCardinality();
-      switch (storedType) {
-        case INT:
-          try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case LONG:
-          try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case FLOAT:
-          try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case DOUBLE:
-          try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case STRING:
-          try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        case BYTES:
-          try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
-              columnMetadata.getColumnMaxLength())) {
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-                bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1), storedType);
-          }
-          break;
-        default:
-          throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
-      }
-    } else {
-      // setting min/max for non-dictionary columns.
+    boolean isSingleValue = columnMetadata.isSingleValue();
+    PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
+    try (ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
+        isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
       int numDocs = columnMetadata.getTotalDocs();
-      PinotDataBuffer rawIndexBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
-      boolean isSingleValue = _segmentMetadata.getSchema().getFieldSpecFor(columnName).isSingleValueField();
-      try (
-          ForwardIndexReader rawIndexReader = ForwardIndexReaderFactory.createRawIndexReader(rawIndexBuffer, storedType,
-              isSingleValue); ForwardIndexReaderContext readerContext = rawIndexReader.createContext()) {
-        switch (storedType) {
-          case INT: {
-            int min = Integer.MAX_VALUE;
-            int max = Integer.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int value = rawIndexReader.getInt(docId, readerContext);
+      Object minValue;
+      Object maxValue;
+      switch (storedType) {
+        case INT: {
+          int min = Integer.MAX_VALUE;
+          int max = Integer.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int value = rawIndexReader.getInt(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              int[] values = rawIndexReader.getIntMV(docId, readerContext);
+              for (int value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                int[] values = rawIndexReader.getIntMV(docId, readerContext);
-                for (int value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case LONG: {
-            long min = Long.MAX_VALUE;
-            long max = Long.MIN_VALUE;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long value = rawIndexReader.getLong(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case LONG: {
+          long min = Long.MAX_VALUE;
+          long max = Long.MIN_VALUE;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long value = rawIndexReader.getLong(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              long[] values = rawIndexReader.getLongMV(docId, readerContext);
+              for (long value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                long[] values = rawIndexReader.getLongMV(docId, readerContext);
-                for (long value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case FLOAT: {
-            float min = Float.POSITIVE_INFINITY;
-            float max = Float.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float value = rawIndexReader.getFloat(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case FLOAT: {
+          float min = Float.POSITIVE_INFINITY;
+          float max = Float.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float value = rawIndexReader.getFloat(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              float[] values = rawIndexReader.getFloatMV(docId, readerContext);
+              for (float value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                float[] values = rawIndexReader.getFloatMV(docId, readerContext);
-                for (float value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case DOUBLE: {
-            double min = Double.POSITIVE_INFINITY;
-            double max = Double.NEGATIVE_INFINITY;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double value = rawIndexReader.getDouble(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case DOUBLE: {
+          double min = Double.POSITIVE_INFINITY;
+          double max = Double.NEGATIVE_INFINITY;
+          if (isSingleValue) {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double value = rawIndexReader.getDouble(docId, readerContext);
+              min = Math.min(min, value);
+              max = Math.max(max, value);
+            }
+          } else {
+            for (int docId = 0; docId < numDocs; docId++) {
+              double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
+              for (double value : values) {
                 min = Math.min(min, value);
                 max = Math.max(max, value);
               }
-            } else {
-              for (int docId = 0; docId < numDocs; docId++) {
-                double[] values = rawIndexReader.getDoubleMV(docId, readerContext);
-                for (double value : values) {
-                  min = Math.min(min, value);
-                  max = Math.max(max, value);
-                }
-              }
             }
-            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName, String.valueOf(min),
-                String.valueOf(max), storedType);
-            break;
           }
-          case STRING: {
-            String min = null;
-            String max = null;
-            if (isSingleValue) {
-              for (int docId = 0; docId < numDocs; docId++) {
-                String value = rawIndexReader.getString(docId, readerContext);
+          minValue = min;
+          maxValue = max;
+          break;
+        }
+        case BIG_DECIMAL: {
+          Preconditions.checkState(isSingleValue, "Unsupported multi-value BIG_DECIMAL column: %s", columnName);

Review Comment:
   is this generally true in Pinot or just too complicated to handle here for now?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org