You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by "jadami10 (via GitHub)" <gi...@apache.org> on 2024/02/26 17:20:31 UTC

Re: [PR] Ensure min/max value generation in the segment metadata. [pinot]

jadami10 commented on code in PR #10891:
URL: https://github.com/apache/pinot/pull/10891#discussion_r1503020298


##########
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/loader/columnminmaxvalue/ColumnMinMaxValueGenerator.java:
##########
@@ -119,65 +121,126 @@ private List<String> getColumnsToAddMinMaxValue() {
 
   private boolean needAddColumnMinMaxValueForColumn(String columnName) {
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    return columnMetadata.hasDictionary() && columnMetadata.getMinValue() == null
+    return columnMetadata.getMinValue() == null
         && columnMetadata.getMaxValue() == null && !columnMetadata.isMinMaxValueInvalid();
   }
 
   private void addColumnMinMaxValueForColumn(String columnName)
       throws Exception {
     // Skip column without dictionary or with min/max value already set
     ColumnMetadata columnMetadata = _segmentMetadata.getColumnMetadataFor(columnName);
-    if (!columnMetadata.hasDictionary() || columnMetadata.getMinValue() != null
-        || columnMetadata.getMaxValue() != null) {
+    if (columnMetadata.getMinValue() != null || columnMetadata.getMaxValue() != null) {
       return;
     }
 
-    PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
     DataType dataType = columnMetadata.getDataType().getStoredType();
-    int length = columnMetadata.getCardinality();
-    switch (dataType) {
-      case INT:
-        try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1));
-        }
-        break;
-      case LONG:
-        try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1));
-        }
-        break;
-      case FLOAT:
-        try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1));
-        }
-        break;
-      case DOUBLE:
-        try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1));
-        }
-        break;
-      case STRING:
-        try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
-            columnMetadata.getColumnMaxLength())) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1));
-        }
-        break;
-      case BYTES:
-        try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
-            columnMetadata.getColumnMaxLength())) {
-          SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
-              bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1));
-        }
-        break;
-      default:
-        throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+    if (columnMetadata.hasDictionary()) {
+      PinotDataBuffer dictionaryBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.dictionary());
+      int length = columnMetadata.getCardinality();
+      switch (dataType) {
+        case INT:
+          try (IntDictionary intDictionary = new IntDictionary(dictionaryBuffer, length)) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                intDictionary.getStringValue(0), intDictionary.getStringValue(length - 1));
+          }
+          break;
+        case LONG:
+          try (LongDictionary longDictionary = new LongDictionary(dictionaryBuffer, length)) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                longDictionary.getStringValue(0), longDictionary.getStringValue(length - 1));
+          }
+          break;
+        case FLOAT:
+          try (FloatDictionary floatDictionary = new FloatDictionary(dictionaryBuffer, length)) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                floatDictionary.getStringValue(0), floatDictionary.getStringValue(length - 1));
+          }
+          break;
+        case DOUBLE:
+          try (DoubleDictionary doubleDictionary = new DoubleDictionary(dictionaryBuffer, length)) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                doubleDictionary.getStringValue(0), doubleDictionary.getStringValue(length - 1));
+          }
+          break;
+        case STRING:
+          try (StringDictionary stringDictionary = new StringDictionary(dictionaryBuffer, length,
+              columnMetadata.getColumnMaxLength())) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                stringDictionary.getStringValue(0), stringDictionary.getStringValue(length - 1));
+          }
+          break;
+        case BYTES:
+          try (BytesDictionary bytesDictionary = new BytesDictionary(dictionaryBuffer, length,
+              columnMetadata.getColumnMaxLength())) {
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                bytesDictionary.getStringValue(0), bytesDictionary.getStringValue(length - 1));
+          }
+          break;
+        default:
+          throw new IllegalStateException("Unsupported data type: " + dataType + " for column: " + columnName);
+      }
+    } else if (_segmentMetadata.getSchema().getFieldSpecFor(columnName).getFieldType() == FieldSpec.FieldType.METRIC) {
+      // setting min/max for non-dictionary columns.
+      PinotDataBuffer forwardBuffer = _segmentWriter.getIndexFor(columnName, StandardIndexes.forward());
+      switch (dataType) {
+        case INT:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.INT); ChunkReaderContext readerContext = rawIndexReader.createContext()) {
+            Integer[] minMaxValue = {Integer.MAX_VALUE, Integer.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, rawIndexReader.getInt(docs, readerContext));
+            }
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                String.valueOf(minMaxValue[0]), String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case LONG:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.LONG); ChunkReaderContext readerContext = rawIndexReader.createContext()) {
+            Long[] minMaxValue = {Long.MAX_VALUE, Long.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, rawIndexReader.getLong(docs, readerContext));
+            }
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                String.valueOf(minMaxValue[0]), String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case FLOAT:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.FLOAT); ChunkReaderContext readerContext = rawIndexReader.createContext()) {
+            Float[] minMaxValue = {Float.MAX_VALUE, Float.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, rawIndexReader.getFloat(docs, readerContext));
+            }
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                String.valueOf(minMaxValue[0]), String.valueOf(minMaxValue[1]));
+          }
+          break;
+        case DOUBLE:
+          try (FixedByteChunkSVForwardIndexReader rawIndexReader = new FixedByteChunkSVForwardIndexReader(forwardBuffer,
+              DataType.DOUBLE); ChunkReaderContext readerContext = rawIndexReader.createContext()) {
+            Double[] minMaxValue = {Double.MAX_VALUE, Double.MIN_VALUE};
+            for (int docs = 0; docs < columnMetadata.getTotalDocs(); docs++) {
+              minMaxValue = getMinMaxValue(minMaxValue, rawIndexReader.getDouble(docs, readerContext));
+            }
+            SegmentColumnarIndexCreator.addColumnMinMaxValueInfo(_segmentProperties, columnName,
+                String.valueOf(minMaxValue[0]), String.valueOf(minMaxValue[1]));
+          }
+          break;
+        default:

Review Comment:
   our attempted upgrade broke here because `BIG_DECIMAL` isn't supported. I'm curious why we choose to throw an error for an unsupported data type when we were seemingly ok with this data not being populated. 
   
   But more importantly, there's some wider issue here that we're missing entire data types, and no testing is catching it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org