You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by "imply-cheddar (via GitHub)" <gi...@apache.org> on 2023/03/14 07:32:38 UTC

[GitHub] [druid] imply-cheddar commented on a diff in pull request #13803: nested columns + arrays = array columns!

imply-cheddar commented on code in PR #13803:
URL: https://github.com/apache/druid/pull/13803#discussion_r1135011007


##########
processing/src/main/java/org/apache/druid/data/input/impl/DelimitedInputFormat.java:
##########
@@ -41,6 +42,18 @@
 public class DelimitedInputFormat extends FlatTextInputFormat
 {
   public static final String TYPE_KEY = "tsv";
+
+  public static DelimitedInputFormat ofColumns(String... columns)
+  {
+    return new DelimitedInputFormat(
+        Arrays.asList(columns),
+        null,
+        null,
+        false,
+        false,
+        0
+    );
+  }

Review Comment:
   Location nit: does this need to be in `main` rather than `test`?



##########
processing/src/main/java/org/apache/druid/data/input/impl/MapInputRowParser.java:
##########
@@ -77,22 +76,36 @@ public static InputRow parse(InputRowSchema inputRowSchema, Map<String, Object>
    * 3) If isIncludeAllDimensions is not set and {@link DimensionsSpec#getDimensionNames()} is empty,
    *    the dimensions in the given map is returned.
    *
-   * In any case, the returned list does not include any dimensions in {@link DimensionsSpec#getDimensionExclusions()}.
+   * In any case, the returned list does not include any dimensions in {@link DimensionsSpec#getDimensionExclusions()}
+   * or {@link TimestampSpec#getTimestampColumn()}.
    */
   private static List<String> findDimensions(
+      TimestampSpec timestampSpec,
       DimensionsSpec dimensionsSpec,
       Map<String, Object> rawInputRow
   )
   {
     if (dimensionsSpec.isIncludeAllDimensions()) {
       LinkedHashSet<String> dimensions = new LinkedHashSet<>(dimensionsSpec.getDimensionNames());
-      dimensions.addAll(Sets.difference(rawInputRow.keySet(), dimensionsSpec.getDimensionExclusions()));
+      for (String field : rawInputRow.keySet()) {
+        if (timestampSpec.getTimestampColumn().equals(field) || dimensionsSpec.getDimensionExclusions().contains(field)) {

Review Comment:
   can we just get these two things once?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
       this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
     }
 
-    private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable Object value)
+    private StructuredDataProcessor.ProcessedLiteral<?> processValue(ExprEval<?> eval)
     {
-      // null value is always added to the global dictionary as id 0, so we can ignore them here
-      if (value != null) {
-        // why not
-        ExprEval<?> eval = ExprEval.bestEffortOf(value);
-        final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
-        switch (columnType.getType()) {
-          case LONG:
-            globalDimensionDictionary.addLongValue(eval.asLong());
-            typeSet.add(ColumnType.LONG);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asLong(),
-                StructuredDataProcessor.getLongObjectEstimateSize()
-            );
-          case DOUBLE:
-            globalDimensionDictionary.addDoubleValue(eval.asDouble());
-            typeSet.add(ColumnType.DOUBLE);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asDouble(),
-                StructuredDataProcessor.getDoubleObjectEstimateSize()
-            );
-          case STRING:
-          default:
-            final String asString = eval.asString();
-            globalDimensionDictionary.addStringValue(asString);
-            typeSet.add(ColumnType.STRING);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asString(),
-                StructuredDataProcessor.estimateStringSize(asString)
-            );
-        }
+      final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+      int sizeEstimate;
+      switch (columnType.getType()) {
+        case LONG:
+          typeSet.add(ColumnType.LONG);
+          sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(), sizeEstimate);
+        case DOUBLE:
+          typeSet.add(ColumnType.DOUBLE);
+          sizeEstimate = globalDimensionDictionary.addDoubleValue(eval.asDouble());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+        case ARRAY:
+          // skip empty arrays for now, they will always be called 'string' arrays, which isn't very helpful here since
+          // it will pollute the type set
+          Preconditions.checkNotNull(columnType.getElementType(), "Array element type must not be null");

Review Comment:
   If this were to ever happen, I think I'd want to know which field was the bad one.



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -411,7 +445,7 @@ public Object getObject()
         if (0 <= dimIndex && dimIndex < dims.length) {
           final StructuredData data = (StructuredData) dims[dimIndex];
           if (data != null) {
-            return data.getValue();
+            return ExprEval.bestEffortOf(data.getValue()).value();

Review Comment:
   Is this counting on coercion or something?



##########
processing/src/main/java/org/apache/druid/data/input/impl/JsonInputFormat.java:
##########
@@ -41,6 +41,14 @@ public class JsonInputFormat extends NestedInputFormat
 {
   public static final String TYPE_KEY = "json";
 
+  public static final JsonInputFormat DEFAULT = new JsonInputFormat(
+      JSONPathSpec.DEFAULT,
+      null,
+      null,
+      null,
+      null
+  );
+

Review Comment:
   Location nit: does this need to be in `main` rather than `test`?



##########
processing/src/main/java/org/apache/druid/query/metadata/SegmentAnalyzer.java:
##########
@@ -134,6 +134,11 @@ public Map<String, ColumnAnalysis> analyze(Segment segment)
             analysis = analyzeStringColumn(capabilities, storageAdapter, columnName);
           }
           break;
+        case ARRAY:
+          // todo (clint): this is wack, but works for now because arrays are always nested complex columns...

Review Comment:
   Instead of a todo like this.  How about an if check that validates that it's a nested column and, if it's not a nested column, throws an Exception that is clearly saying that the code is currently written assuming that the arrays are delivered only by nested columns?



##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
       final StructuredDataProcessor processor = new StructuredDataProcessor()
       {
         @Override
-        public StructuredDataProcessor.ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+        public StructuredDataProcessor.ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
         {
           // do nothing, we only want the list of fields returned by this processor
           return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
         }
+
+        @Nullable
+        @Override
+        public ProcessedLiteral<?> processArrayOfLiteralsField(
+            ArrayList<NestedPathPart> fieldPath,
+            @Nullable Object maybeArrayOfLiterals
+        )
+        {
+          ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);
+          if (eval.type().isArray() && eval.type().getElementType().isPrimitive()) {
+            return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+          }

Review Comment:
   The if condition makes me believe that we found an array and should be joyously celebrating, but the return value is a `NULL_LITERAL`, why is it correct to return a `NULL_LITERAL` and how is that different from just returning `null`?



##########
extensions-core/parquet-extensions/src/test/java/org/apache/druid/data/input/parquet/NestedColumnParquetReaderTest.java:
##########
@@ -181,7 +181,7 @@ public void testNestedColumnSchemalessNestedTestFileNoNested() throws IOExceptio
     );
 
     List<InputRow> rows = readAllRows(reader);
-    Assert.assertEquals(ImmutableList.of("dim1", "metric1", "timestamp"), rows.get(0).getDimensions());
+    Assert.assertEquals(ImmutableList.of("dim1", "metric1"), rows.get(0).getDimensions());

Review Comment:
   Why the change in expectation?



##########
processing/src/main/java/org/apache/druid/query/expression/NestedDataExpressions.java:
##########
@@ -480,11 +480,25 @@ public Expr apply(List<Expr> args)
       final StructuredDataProcessor processor = new StructuredDataProcessor()
       {
         @Override
-        public StructuredDataProcessor.ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+        public StructuredDataProcessor.ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
         {
           // do nothing, we only want the list of fields returned by this processor
           return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
         }
+
+        @Nullable
+        @Override
+        public ProcessedLiteral<?> processArrayOfLiteralsField(
+            ArrayList<NestedPathPart> fieldPath,
+            @Nullable Object maybeArrayOfLiterals
+        )
+        {
+          ExprEval<?> eval = ExprEval.bestEffortOf(maybeArrayOfLiterals);

Review Comment:
   Would it make sense to break `ExprEval.bestEffortOf` into a bunch of checks for different groups of expected types (i.e. `ExprEval.maybeLiteral()` and `ExprEval.maybeArray`, etc.).  Calls to `bestEffortOf` can cascade through, but places like this that already know some of what they expect can call the one that more aligns with expectations?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements DimensionIndexer<StructuredData,
   protected final StructuredDataProcessor indexerProcessor = new StructuredDataProcessor()
   {
     @Override
-    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
     {
-      final String fieldName = NestedPathFinder.toNormalizedJsonPath(fieldPath);
-      LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
-      if (fieldIndexer == null) {
-        estimatedFieldKeySize += StructuredDataProcessor.estimateStringSize(fieldName);
-        fieldIndexer = new LiteralFieldIndexer(globalDictionary);
-        fieldIndexers.put(fieldName, fieldIndexer);
+      // null value is always added to the global dictionary as id 0, so we can ignore them here
+      if (fieldValue != null) {
+        // why not
+        final String fieldName = NestedPathFinder.toNormalizedJsonPath(fieldPath);
+        ExprEval<?> eval = ExprEval.bestEffortOf(fieldValue);
+        LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+        if (fieldIndexer == null) {
+          estimatedFieldKeySize += StructuredDataProcessor.estimateStringSize(fieldName);
+          fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+          fieldIndexers.put(fieldName, fieldIndexer);
+        }
+        return fieldIndexer.processValue(eval);
       }
-      return fieldIndexer.processValue(fieldValue);
+      return StructuredDataProcessor.ProcessedLiteral.NULL_LITERAL;
+    }
+
+    @Nullable
+    @Override
+    public ProcessedLiteral<?> processArrayOfLiteralsField(
+        ArrayList<NestedPathPart> fieldPath,
+        Object maybeArrayOfLiterals
+    )
+    {
+      final ExprEval<?> maybeLiteralArray = ExprEval.bestEffortOf(maybeArrayOfLiterals);
+      if (maybeLiteralArray.type().isArray() && maybeLiteralArray.type().getElementType().isPrimitive()) {
+        final String fieldName = NestedPathFinder.toNormalizedJsonPath(fieldPath);
+        LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
+        if (fieldIndexer == null) {
+          estimatedFieldKeySize += StructuredDataProcessor.estimateStringSize(fieldName);
+          fieldIndexer = new LiteralFieldIndexer(globalDictionary);
+          fieldIndexers.put(fieldName, fieldIndexer);
+        }
+        return fieldIndexer.processValue(maybeLiteralArray);
+      }
+      return null;

Review Comment:
   This looks a lot like code in NestedDataExpressions, except this doesn't return the `NULL_LITERAL`.  I find myself wondering if the NestedDataExpressions code shouldn't look like this?



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -59,16 +62,43 @@ public class NestedDataColumnIndexer implements DimensionIndexer<StructuredData,
   protected final StructuredDataProcessor indexerProcessor = new StructuredDataProcessor()
   {
     @Override
-    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, Object fieldValue)
+    public ProcessedLiteral<?> processLiteralField(ArrayList<NestedPathPart> fieldPath, @Nullable Object fieldValue)
     {
-      final String fieldName = NestedPathFinder.toNormalizedJsonPath(fieldPath);
-      LiteralFieldIndexer fieldIndexer = fieldIndexers.get(fieldName);
-      if (fieldIndexer == null) {
-        estimatedFieldKeySize += StructuredDataProcessor.estimateStringSize(fieldName);
-        fieldIndexer = new LiteralFieldIndexer(globalDictionary);
-        fieldIndexers.put(fieldName, fieldIndexer);
+      // null value is always added to the global dictionary as id 0, so we can ignore them here
+      if (fieldValue != null) {
+        // why not

Review Comment:
   Because I don't know "why?"  (if this comment is attempting to add information, more words please).



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -145,6 +175,10 @@ public DimensionSelector makeDimensionSelector(
     final int dimIndex = desc.getIndex();
     final ColumnValueSelector<?> rootLiteralSelector = getRootLiteralValueSelector(currEntry, dimIndex);
     if (rootLiteralSelector != null) {
+      final LiteralFieldIndexer root = fieldIndexers.get(NestedPathFinder.JSON_PATH_ROOT);
+      if (root.getTypes().getSingleType().isArray()) {
+        throw new UnsupportedOperationException("Not supported");
+      }

Review Comment:
   I'm reading this as "if all we have are root-level entries and they are always arrays, then throw a UOE exception".  I'm pretty sure I'm reading it wrong, but wishing the error message gave me more context without me feeling like I need to expand lines on the review to know what this is validating.



##########
processing/src/main/java/org/apache/druid/segment/NestedDataColumnIndexer.java:
##########
@@ -437,41 +471,51 @@ public Class<?> classOfObject()
       this.typeSet = new NestedLiteralTypeInfo.MutableTypeSet();
     }
 
-    private StructuredDataProcessor.ProcessedLiteral<?> processValue(@Nullable Object value)
+    private StructuredDataProcessor.ProcessedLiteral<?> processValue(ExprEval<?> eval)
     {
-      // null value is always added to the global dictionary as id 0, so we can ignore them here
-      if (value != null) {
-        // why not
-        ExprEval<?> eval = ExprEval.bestEffortOf(value);
-        final ColumnType columnType = ExpressionType.toColumnType(eval.type());
-
-        switch (columnType.getType()) {
-          case LONG:
-            globalDimensionDictionary.addLongValue(eval.asLong());
-            typeSet.add(ColumnType.LONG);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asLong(),
-                StructuredDataProcessor.getLongObjectEstimateSize()
-            );
-          case DOUBLE:
-            globalDimensionDictionary.addDoubleValue(eval.asDouble());
-            typeSet.add(ColumnType.DOUBLE);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asDouble(),
-                StructuredDataProcessor.getDoubleObjectEstimateSize()
-            );
-          case STRING:
-          default:
-            final String asString = eval.asString();
-            globalDimensionDictionary.addStringValue(asString);
-            typeSet.add(ColumnType.STRING);
-            return new StructuredDataProcessor.ProcessedLiteral<>(
-                eval.asString(),
-                StructuredDataProcessor.estimateStringSize(asString)
-            );
-        }
+      final ColumnType columnType = ExpressionType.toColumnType(eval.type());
+      int sizeEstimate;
+      switch (columnType.getType()) {
+        case LONG:
+          typeSet.add(ColumnType.LONG);
+          sizeEstimate = globalDimensionDictionary.addLongValue(eval.asLong());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asLong(), sizeEstimate);
+        case DOUBLE:
+          typeSet.add(ColumnType.DOUBLE);
+          sizeEstimate = globalDimensionDictionary.addDoubleValue(eval.asDouble());
+          return new StructuredDataProcessor.ProcessedLiteral<>(eval.asDouble(), sizeEstimate);
+        case ARRAY:
+          // skip empty arrays for now, they will always be called 'string' arrays, which isn't very helpful here since
+          // it will pollute the type set

Review Comment:
   How does it do the skipping of empties?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org