You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by si...@apache.org on 2022/01/08 00:40:56 UTC

[pinot] branch master updated: Fix ByteArray datatype column metadata getMaxValue NPE bug and expose maxNumMultiValues (#7918)

This is an automated email from the ASF dual-hosted git repository.

siddteotia pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 06888d8  Fix ByteArray datatype column metadata getMaxValue NPE bug and expose maxNumMultiValues (#7918)
06888d8 is described below

commit 06888d8b60598b661ae90df7e7a2cc628460191a
Author: Liang Mingqiang <mi...@linkedin.com>
AuthorDate: Fri Jan 7 16:40:36 2022 -0800

    Fix ByteArray datatype column metadata getMaxValue NPE bug and expose maxNumMultiValues (#7918)
---
 .../restlet/resources/TableMetadataInfo.java       |   9 +-
 .../util/ServerSegmentMetadataReader.java          |  11 +-
 .../tests/OfflineClusterIntegrationTest.java       | 117 ++++++++++++++++-----
 .../pinot/server/api/resources/TablesResource.java |  22 ++--
 4 files changed, 118 insertions(+), 41 deletions(-)

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java
index c2c62d6..81d86b6 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/restlet/resources/TableMetadataInfo.java
@@ -41,18 +41,21 @@ public class TableMetadataInfo {
   private final long _numRows;
   private final Map<String, Double> _columnLengthMap;
   private final Map<String, Double> _columnCardinalityMap;
+  private final Map<String, Double> _maxNumMultiValuesMap;
 
   @JsonCreator
   public TableMetadataInfo(@JsonProperty("tableName") String tableName,
       @JsonProperty("diskSizeInBytes") long sizeInBytes, @JsonProperty("numSegments") long numSegments,
       @JsonProperty("numRows") long numRows, @JsonProperty("columnLengthMap") Map<String, Double> columnLengthMap,
-      @JsonProperty("columnCardinalityMap") Map<String, Double> columnCardinalityMap) {
+      @JsonProperty("columnCardinalityMap") Map<String, Double> columnCardinalityMap,
+      @JsonProperty("maxNumMultiValuesMap") Map<String, Double> maxNumMultiValuesMap) {
     _tableName = tableName;
     _diskSizeInBytes = sizeInBytes;
     _numSegments = numSegments;
     _numRows = numRows;
     _columnLengthMap = columnLengthMap;
     _columnCardinalityMap = columnCardinalityMap;
+    _maxNumMultiValuesMap = maxNumMultiValuesMap;
   }
 
   public String getTableName() {
@@ -78,4 +81,8 @@ public class TableMetadataInfo {
   public Map<String, Double> getColumnCardinalityMap() {
     return _columnCardinalityMap;
   }
+
+  public Map<String, Double> getMaxNumMultiValuesMap() {
+    return _maxNumMultiValuesMap;
+  }
 }
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java b/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java
index 5fb8642..4ee509a 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/util/ServerSegmentMetadataReader.java
@@ -87,6 +87,7 @@ public class ServerSegmentMetadataReader {
     int failedParses = 0;
     final Map<String, Double> columnLengthMap = new HashMap<>();
     final Map<String, Double> columnCardinalityMap = new HashMap<>();
+    final Map<String, Double> maxNumMultiValuesMap = new HashMap<>();
     for (Map.Entry<String, String> streamResponse : serviceResponse._httpResponses.entrySet()) {
       try {
         TableMetadataInfo tableMetadataInfo =
@@ -96,6 +97,7 @@ public class ServerSegmentMetadataReader {
         totalNumSegments += tableMetadataInfo.getNumSegments();
         tableMetadataInfo.getColumnLengthMap().forEach((k, v) -> columnLengthMap.merge(k, v, Double::sum));
         tableMetadataInfo.getColumnCardinalityMap().forEach((k, v) -> columnCardinalityMap.merge(k, v, Double::sum));
+        tableMetadataInfo.getMaxNumMultiValuesMap().forEach((k, v) -> maxNumMultiValuesMap.merge(k, v, Double::sum));
       } catch (IOException e) {
         failedParses++;
         LOGGER.error("Unable to parse server {} response due to an error: ", streamResponse.getKey(), e);
@@ -104,17 +106,18 @@ public class ServerSegmentMetadataReader {
     int finalTotalNumSegments = totalNumSegments;
     columnLengthMap.replaceAll((k, v) -> v / finalTotalNumSegments);
     columnCardinalityMap.replaceAll((k, v) -> v / finalTotalNumSegments);
+    maxNumMultiValuesMap.replaceAll((k, v) -> v / finalTotalNumSegments);
 
     // Since table segments may have multiple replicas, divide diskSizeInBytes, numRows and numSegments by numReplica
-    // to avoid double counting, for columnAvgLengthMap and columnAvgCardinalityMap, dividing by numReplica is not
-    // needed since totalNumSegments already contains replicas.
+    // to avoid double counting, for columnAvgLengthMap, columnAvgCardinalityMap and maxNumMultiValuesMap, dividing by
+    // numReplica is not needed since totalNumSegments already contains replicas.
     totalDiskSizeInBytes /= numReplica;
     totalNumSegments /= numReplica;
     totalNumRows /= numReplica;
 
     TableMetadataInfo aggregateTableMetadataInfo =
-        new TableMetadataInfo("", totalDiskSizeInBytes, totalNumSegments, totalNumRows, columnLengthMap,
-            columnCardinalityMap);
+        new TableMetadataInfo(tableNameWithType, totalDiskSizeInBytes, totalNumSegments, totalNumRows, columnLengthMap,
+            columnCardinalityMap, maxNumMultiValuesMap);
     if (failedParses != 0) {
       LOGGER.warn("Failed to parse {} / {} aggregated segment metadata responses from servers.", failedParses,
           serverUrls.size());
diff --git a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
index 519675c..5a41f91 100644
--- a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
+++ b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
@@ -123,6 +123,7 @@ public class OfflineClusterIntegrationTest extends BaseClusterIntegrationTestSet
   private static final String NUM_ROWS_KEY = "numRows";
   private static final String COLUMN_LENGTH_MAP_KEY = "columnLengthMap";
   private static final String COLUMN_CARDINALITY_MAP_KEY = "columnCardinalityMap";
+  private static final String MAX_NUM_MULTI_VALUES_MAP_KEY = "maxNumMultiValuesMap";
   // TODO: This might lead to flaky test, as this disk size is not deterministic
   //       as it depends on the iteration order of a HashSet.
   private static final int DISK_SIZE_IN_BYTES = 20796000;
@@ -1832,21 +1833,63 @@ public class OfflineClusterIntegrationTest extends BaseClusterIntegrationTestSet
   @Test
   public void testAggregateMetadataAPI()
       throws IOException {
-    JsonNode oneColumnResponse = JsonUtils.stringToJsonNode(
-        sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata?columns=DestCityMarketID"));
-    assertEquals(oneColumnResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
-    assertEquals(oneColumnResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
-    assertEquals(oneColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
-    assertEquals(oneColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 1);
-    assertEquals(oneColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 1);
-
-    JsonNode threeColumnsResponse = JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl
+    JsonNode oneSVColumnResponse = JsonUtils
+        .stringToJsonNode(sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata?columns=DestCityMarketID"));
+    assertEquals(oneSVColumnResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(oneSVColumnResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(oneSVColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(oneSVColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 1);
+    assertEquals(oneSVColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 1);
+    // DestCityMarketID is a SV column
+    assertEquals(oneSVColumnResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 0);
+
+    JsonNode oneMVColumnResponse = JsonUtils
+        .stringToJsonNode(sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata?columns=DivLongestGTimes"));
+    assertEquals(oneMVColumnResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(oneMVColumnResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(oneMVColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(oneMVColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 1);
+    assertEquals(oneMVColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 1);
+    // DivLongestGTimes is a MV column
+    assertEquals(oneMVColumnResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 1);
+
+    JsonNode threeSVColumnsResponse = JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl
         + "/tables/mytable/metadata?columns=DivActualElapsedTime&columns=CRSElapsedTime&columns=OriginStateName"));
-    assertEquals(threeColumnsResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
-    assertEquals(threeColumnsResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
-    assertEquals(threeColumnsResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
-    assertEquals(threeColumnsResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 3);
-    assertEquals(threeColumnsResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 3);
+    assertEquals(threeSVColumnsResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(threeSVColumnsResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(threeSVColumnsResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(threeSVColumnsResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 3);
+    assertEquals(threeSVColumnsResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 3);
+    assertEquals(threeSVColumnsResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 0);
+
+    JsonNode threeSVColumnsWholeEncodedResponse = JsonUtils.stringToJsonNode(sendGetRequest(
+        _controllerBaseApiUrl + "/tables/mytable/metadata?columns="
+            + "DivActualElapsedTime%26columns%3DCRSElapsedTime%26columns%3DOriginStateName"));
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 3);
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 3);
+    assertEquals(threeSVColumnsWholeEncodedResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 0);
+
+    JsonNode threeMVColumnsResponse = JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl
+        + "/tables/mytable/metadata?columns=DivLongestGTimes&columns=DivWheelsOns&columns=DivAirports"));
+    assertEquals(threeMVColumnsResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(threeMVColumnsResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(threeMVColumnsResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(threeMVColumnsResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 3);
+    assertEquals(threeMVColumnsResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 3);
+    assertEquals(threeMVColumnsResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 3);
+
+    JsonNode threeMVColumnsWholeEncodedResponse = JsonUtils.stringToJsonNode(sendGetRequest(
+        _controllerBaseApiUrl + "/tables/mytable/metadata?columns="
+            + "DivLongestGTimes%26columns%3DDivWheelsOns%26columns%3DDivAirports"));
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 3);
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 3);
+    assertEquals(threeMVColumnsWholeEncodedResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 3);
 
     JsonNode zeroColumnResponse =
         JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata"));
@@ -1855,21 +1898,39 @@ public class OfflineClusterIntegrationTest extends BaseClusterIntegrationTestSet
     assertEquals(zeroColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
     assertEquals(zeroColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 0);
     assertEquals(zeroColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 0);
+    assertEquals(zeroColumnResponse.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 0);
 
-    JsonNode allColumnResponse =
+    JsonNode starColumnResponse =
         JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata?columns=*"));
-    assertEquals(allColumnResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
-    assertEquals(allColumnResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
-    assertEquals(allColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
-    assertEquals(allColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 82);
-    assertEquals(allColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 82);
-
-    allColumnResponse = JsonUtils.stringToJsonNode(sendGetRequest(
-        _controllerBaseApiUrl + "/tables/mytable/metadata?columns=CRSElapsedTime&columns=*&columns=OriginStateName"));
-    assertEquals(allColumnResponse.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
-    assertEquals(allColumnResponse.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
-    assertEquals(allColumnResponse.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
-    assertEquals(allColumnResponse.get(COLUMN_LENGTH_MAP_KEY).size(), 82);
-    assertEquals(allColumnResponse.get(COLUMN_CARDINALITY_MAP_KEY).size(), 82);
+    validateAllColumnsResponse(starColumnResponse);
+
+    JsonNode starEncodedColumnResponse =
+        JsonUtils.stringToJsonNode(sendGetRequest(_controllerBaseApiUrl + "/tables/mytable/metadata?columns=%2A"));
+    validateAllColumnsResponse(starEncodedColumnResponse);
+
+    JsonNode starWithExtraColumnResponse = JsonUtils.stringToJsonNode(sendGetRequest(
+        _controllerBaseApiUrl + "/tables/mytable/metadata?columns="
+            + "CRSElapsedTime&columns=*&columns=OriginStateName"));
+    validateAllColumnsResponse(starWithExtraColumnResponse);
+
+    JsonNode starWithExtraEncodedColumnResponse = JsonUtils.stringToJsonNode(sendGetRequest(
+        _controllerBaseApiUrl + "/tables/mytable/metadata?columns="
+            + "CRSElapsedTime&columns=%2A&columns=OriginStateName"));
+    validateAllColumnsResponse(starWithExtraEncodedColumnResponse);
+
+    JsonNode starWithExtraColumnWholeEncodedResponse = JsonUtils.stringToJsonNode(sendGetRequest(
+        _controllerBaseApiUrl + "/tables/mytable/metadata?columns="
+            + "CRSElapsedTime%26columns%3D%2A%26columns%3DOriginStateName"));
+    validateAllColumnsResponse(starWithExtraColumnWholeEncodedResponse);
+  }
+
+  private void validateAllColumnsResponse(JsonNode response) {
+    assertEquals(response.get(DISK_SIZE_IN_BYTES_KEY).asInt(), DISK_SIZE_IN_BYTES);
+    assertEquals(response.get(NUM_SEGMENTS_KEY).asInt(), NUM_SEGMENTS);
+    assertEquals(response.get(NUM_ROWS_KEY).asInt(), NUM_ROWS);
+    // mytable has 82 columns, among them 9 of the columns are MV columns.
+    assertEquals(response.get(COLUMN_LENGTH_MAP_KEY).size(), 82);
+    assertEquals(response.get(COLUMN_CARDINALITY_MAP_KEY).size(), 82);
+    assertEquals(response.get(MAX_NUM_MULTI_VALUES_MAP_KEY).size(), 9);
   }
 }
diff --git a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java
index 693cabc..ebf3346 100644
--- a/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java
+++ b/pinot-server/src/main/java/org/apache/pinot/server/api/resources/TablesResource.java
@@ -184,6 +184,7 @@ public class TablesResource {
     long totalNumRows = 0;
     Map<String, Double> columnLengthMap = new HashMap<>();
     Map<String, Double> columnCardinalityMap = new HashMap<>();
+    Map<String, Double> maxNumMultiValuesMap = new HashMap<>();
     try {
       for (SegmentDataManager segmentDataManager : segmentDataManagers) {
         if (segmentDataManager instanceof ImmutableSegmentDataManager) {
@@ -202,7 +203,7 @@ public class TablesResource {
           }
           for (String column : columnSet) {
             ColumnMetadata columnMetadata = segmentMetadata.getColumnMetadataMap().get(column);
-            int columnLength;
+            int columnLength = 0;
             DataType storedDataType = columnMetadata.getDataType().getStoredType();
             if (storedDataType.isFixedWidth()) {
               // For type of fixed width: INT, LONG, FLOAT, DOUBLE, BOOLEAN (stored as INT), TIMESTAMP (stored as LONG),
@@ -210,21 +211,26 @@ public class TablesResource {
               columnLength = storedDataType.size();
             } else if (columnMetadata.hasDictionary()) {
               // For type of variable width (String, Bytes), if it's stored using dictionary encoding, set the
-              // columnLength as the max
-              // length in dictionary.
+              // columnLength as the max length in dictionary.
               columnLength = columnMetadata.getColumnMaxLength();
             } else if (storedDataType == DataType.STRING || storedDataType == DataType.BYTES) {
               // For type of variable width (String, Bytes), if it's stored using raw bytes, set the columnLength as
-              // the length
-              // of the max value.
-              columnLength = ((String) columnMetadata.getMaxValue()).getBytes(StandardCharsets.UTF_8).length;
+              // the length of the max value.
+              if (columnMetadata.getMaxValue() != null) {
+                String maxValueString = (String) columnMetadata.getMaxValue();
+                columnLength = maxValueString.getBytes(StandardCharsets.UTF_8).length;
+              }
             } else {
               // For type of STRUCT, MAP, LIST, set the columnLength as DEFAULT_MAX_LENGTH (512).
               columnLength = FieldSpec.DEFAULT_MAX_LENGTH;
             }
-            int columnCardinality = segmentMetadata.getColumnMetadataMap().get(column).getCardinality();
+            int columnCardinality = columnMetadata.getCardinality();
             columnLengthMap.merge(column, (double) columnLength, Double::sum);
             columnCardinalityMap.merge(column, (double) columnCardinality, Double::sum);
+            if (!columnMetadata.isSingleValue()) {
+              int maxNumMultiValues = columnMetadata.getMaxNumberOfMultiValues();
+              maxNumMultiValuesMap.merge(column, (double) maxNumMultiValues, Double::sum);
+            }
           }
         }
       }
@@ -239,7 +245,7 @@ public class TablesResource {
 
     TableMetadataInfo tableMetadataInfo =
         new TableMetadataInfo(tableDataManager.getTableName(), totalSegmentSizeBytes, segmentDataManagers.size(),
-            totalNumRows, columnLengthMap, columnCardinalityMap);
+            totalNumRows, columnLengthMap, columnCardinalityMap, maxNumMultiValuesMap);
     return ResourceUtils.convertToJsonString(tableMetadataInfo);
   }
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org