You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by cw...@apache.org on 2023/02/01 12:15:21 UTC

[druid] branch master updated: Fixing incorrect filtering of nulls in an array when ingesting for JSON and Avro (#13712)

This is an automated email from the ASF dual-hosted git repository.

cwylie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 74ff848ce5 Fixing  incorrect filtering of nulls in an array when ingesting for JSON and Avro (#13712)
74ff848ce5 is described below

commit 74ff848ce54cd389bdae6153322ba8cbc32d5cb2
Author: somu-imply <93...@users.noreply.github.com>
AuthorDate: Wed Feb 1 04:15:08 2023 -0800

    Fixing  incorrect filtering of nulls in an array when ingesting for JSON and Avro (#13712)
---
 .../util/common/parsers/JSONFlattenerMaker.java    |  5 +--
 .../druid/data/input/impl/JSONParseSpecTest.java   | 39 +++++++++++++++++-
 .../druid/data/input/impl/JsonLineReaderTest.java  |  6 ++-
 .../druid/data/input/avro/AvroFlattenerMaker.java  |  3 +-
 .../avro-extensions/src/test/avro/some-datum.avsc  | 12 ++++--
 .../data/input/AvroStreamInputFormatTest.java      |  7 ++--
 .../data/input/AvroStreamInputRowParserTest.java   |  7 ++--
 .../data/input/avro/AvroFlattenerMakerTest.java    | 48 +++++++++++++++++++++-
 8 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java b/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
index 0b8244e29b..5df9819908 100644
--- a/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
+++ b/core/src/main/java/org/apache/druid/java/util/common/parsers/JSONFlattenerMaker.java
@@ -179,12 +179,11 @@ public class JSONFlattenerMaker implements ObjectFlatteners.FlattenerMaker<JsonN
       return ((BinaryNode) val).binaryValue();
     }
 
+
     if (val.isArray()) {
       List<Object> newList = new ArrayList<>();
       for (JsonNode entry : val) {
-        if (!entry.isNull()) {
-          newList.add(convertJsonNode(entry, enc));
-        }
+        newList.add(convertJsonNode(entry, enc));
       }
       return newList;
     }
diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java
index d5728f7988..b89c11258b 100644
--- a/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java
+++ b/core/src/test/java/org/apache/druid/data/input/impl/JSONParseSpecTest.java
@@ -102,7 +102,7 @@ public class JSONParseSpecTest
     );
 
     final Map<String, Object> expected = new HashMap<>();
-    expected.put("foo", new ArrayList());
+    expected.put("foo", Collections.singletonList(null));
     expected.put("baz", null);
     expected.put("bar", Collections.singletonList("test"));
 
@@ -113,6 +113,43 @@ public class JSONParseSpecTest
     Assert.assertEquals(expected, parsedRow);
   }
 
+  @Test
+  public void testParseRowWithNullsInArrays()
+  {
+    final JSONParseSpec parseSpec = new JSONParseSpec(
+        new TimestampSpec("timestamp", "iso", null),
+        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("foo"))),
+        new JSONPathSpec(
+            true,
+            ImmutableList.of(
+                // https://github.com/apache/druid/issues/6653 $.x.y.z where y is missing
+                new JSONPathFieldSpec(JSONPathFieldType.PATH, "foo", "$.baz.[?(@.maybe_object)].maybe_object"),
+                // https://github.com/apache/druid/issues/6653 $.x.y.z where y is from an array and is null
+                new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullFoo", "$.nullFoo.[?(@.value)][0].foo"),
+                new JSONPathFieldSpec(JSONPathFieldType.PATH, "baz", "$.baz"),
+                // $.x.y.z where x is from an array and is null
+                new JSONPathFieldSpec(JSONPathFieldType.PATH, "nullBaz", "$.baz[1].foo.maybe_object"),
+                new JSONPathFieldSpec(JSONPathFieldType.PATH, "bar", "$.[?(@.something_else)].something_else.foo")
+            )
+        ),
+        null,
+        false
+    );
+
+    final Map<String, Object> expected = new HashMap<>();
+    expected.put("foo", new ArrayList<>());
+    expected.put("baz", Arrays.asList("1", null, "2", null));
+    expected.put("bar", Collections.singletonList("test"));
+    expected.put("nullFoo", new ArrayList<>());
+    expected.put("nullBaz", null);
+
+    final Parser<String, Object> parser = parseSpec.makeParser();
+    final Map<String, Object> parsedRow = parser.parseToMap("{\"baz\":[\"1\",null,\"2\",null],\"nullFoo\":{\"value\":[null,null]},\"something_else\": {\"foo\": \"test\"}}");
+
+    Assert.assertNotNull(parsedRow);
+    Assert.assertEquals(expected, parsedRow);
+  }
+
   @Test
   public void testSerde() throws IOException
   {
diff --git a/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java b/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java
index 3ea56aa71f..5b0a3a391a 100644
--- a/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java
+++ b/core/src/test/java/org/apache/druid/data/input/impl/JsonLineReaderTest.java
@@ -141,7 +141,11 @@ public class JsonLineReaderTest
       while (iterator.hasNext()) {
         final InputRow row = iterator.next();
         Assert.assertEquals("test", Iterables.getOnlyElement(row.getDimension("bar")));
-        Assert.assertEquals(Collections.emptyList(), row.getDimension("foo"));
+        // Since foo is in the JSONPathSpec it comes as an array of [null]
+        // row.getRaw("foo") comes out as an array of nulls but the
+        // row.getDimension("foo") stringifies it as "null". A future developer should aim to relieve this
+        Assert.assertEquals(Collections.singletonList(null), row.getRaw("foo"));
+        Assert.assertEquals(Collections.singletonList("null"), row.getDimension("foo"));
         Assert.assertTrue(row.getDimension("baz").isEmpty());
         numActualIterations++;
       }
diff --git a/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java b/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java
index ba9d895b1f..d557b65c84 100644
--- a/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java
+++ b/extensions-core/avro-extensions/src/main/java/org/apache/druid/data/input/avro/AvroFlattenerMaker.java
@@ -39,7 +39,6 @@ import java.util.EnumSet;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 import java.util.Set;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -194,7 +193,7 @@ public class AvroFlattenerMaker implements ObjectFlatteners.FlattenerMaker<Gener
     } else if (field instanceof Utf8) {
       return field.toString();
     } else if (field instanceof List) {
-      return ((List<?>) field).stream().filter(Objects::nonNull).map(this::transformValue).collect(Collectors.toList());
+      return ((List<?>) field).stream().map(this::transformValue).collect(Collectors.toList());
     } else if (field instanceof GenericEnumSymbol) {
       return field.toString();
     } else if (field instanceof GenericFixed) {
diff --git a/extensions-core/avro-extensions/src/test/avro/some-datum.avsc b/extensions-core/avro-extensions/src/test/avro/some-datum.avsc
index f03cb708df..fe43d719ee 100644
--- a/extensions-core/avro-extensions/src/test/avro/some-datum.avsc
+++ b/extensions-core/avro-extensions/src/test/avro/some-datum.avsc
@@ -33,10 +33,14 @@
       },
       {
         "name": "someStringArray",
-        "type": {
-          "type": "array",
-          "items": "string"
-        }
+        "type": [
+          "null",
+          {
+            "type": "array",
+            "items": ["null","string"]
+          }
+        ],
+        "default":null
       },
       {
         "name": "someIntValueMap",
diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java
index a59c32b810..177766510d 100644
--- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java
+++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputFormatTest.java
@@ -81,7 +81,7 @@ import static org.apache.druid.data.input.AvroStreamInputRowParserTest.buildSome
  *  "someOtherId": 6568719896,
  *  "isValid": true,
  *  "someIntArray": [1, 2, 4, 8],
- *  "someStringArray": ["8", "4", "2", "1"],
+ *  "someStringArray": ["8", "4", "2", "1", null],
  *  "someIntValueMap": {"8": 8, "1": 1, "2": 2, "4": 4},
  *  "someStringValueMap": {"8": "8", "1": "1", "2": "2", "4": "4"},
  *  "someUnion": "string as union",
@@ -108,14 +108,13 @@ public class AvroStreamInputFormatTest extends InitializedNullHandlingTest
   private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
       "nested",
       SOME_OTHER_ID,
-      "someStringArray",
       "someIntArray",
       "someFloat",
+      "someUnion",
       EVENT_TYPE,
+      ID,
       "someFixed",
       "someBytes",
-      "someUnion",
-      ID,
       "someEnum",
       "someLong",
       "someInt",
diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java
index 3f206a77a0..3bcec4e3c1 100644
--- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java
+++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/AvroStreamInputRowParserTest.java
@@ -88,14 +88,13 @@ public class AvroStreamInputRowParserTest
   private static final List<String> DIMENSIONS_SCHEMALESS = Arrays.asList(
       "nested",
       SOME_OTHER_ID,
-      "someStringArray",
       "someIntArray",
       "someFloat",
+      "someUnion",
       EVENT_TYPE,
+      ID,
       "someFixed",
       "someBytes",
-      "someUnion",
-      ID,
       "someEnum",
       "someLong",
       "someInt",
@@ -128,7 +127,7 @@ public class AvroStreamInputRowParserTest
                                                                   .setSubInt(SUB_INT_VALUE)
                                                                   .setSubLong(SUB_LONG_VALUE)
                                                                   .build();
-  private static final List<CharSequence> SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1");
+  private static final List<CharSequence> SOME_STRING_ARRAY_VALUE = Arrays.asList("8", "4", "2", "1", null);
   private static final List<Integer> SOME_INT_ARRAY_VALUE = Arrays.asList(1, 2, 4, 8);
   static final Map<CharSequence, Integer> SOME_INT_VALUE_MAP_VALUE = Maps.asMap(
       new HashSet<>(Arrays.asList("8", "2", "4", "1")), new Function<CharSequence, Integer>()
diff --git a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java
index 28caea3049..59e15deeb7 100644
--- a/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java
+++ b/extensions-core/avro-extensions/src/test/java/org/apache/druid/data/input/avro/AvroFlattenerMakerTest.java
@@ -164,11 +164,12 @@ public class AvroFlattenerMakerTest
     final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true);
 
     SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum();
-
+    // isFieldPrimitive on someStringArray is false
+    // as it contains items as nulls and strings
+    // so flattenerNested should only be able to discover it
     Assert.assertEquals(
         ImmutableSet.of(
             "someOtherId",
-            "someStringArray",
             "someIntArray",
             "someFloat",
             "eventType",
@@ -210,6 +211,49 @@ public class AvroFlattenerMakerTest
     );
   }
 
+
+  @Test
+  public void testNullsInStringArray()
+  {
+    final AvroFlattenerMaker flattenerNested = new AvroFlattenerMaker(false, false, true, true);
+
+    SomeAvroDatum input = AvroStreamInputRowParserTest.buildSomeAvroDatum();
+
+    Assert.assertEquals(
+        ImmutableSet.of(
+            "someStringValueMap",
+            "someOtherId",
+            "someStringArray",
+            "someIntArray",
+            "someFloat",
+            "isValid",
+            "someIntValueMap",
+            "eventType",
+            "someFixed",
+            "someBytes",
+            "someRecord",
+            "someMultiMemberUnion",
+            "someNull",
+            "someRecordArray",
+            "someUnion",
+            "id",
+            "someEnum",
+            "someLong",
+            "someInt",
+            "timestamp"
+        ),
+        ImmutableSet.copyOf(flattenerNested.discoverRootFields(input))
+    );
+
+    ArrayList<Object> results = (ArrayList<Object>) flattenerNested.getRootField(input, "someStringArray");
+    // 4 strings a 1 null for a total of 5
+    Assert.assertEquals("8", results.get(0).toString());
+    Assert.assertEquals("4", results.get(1).toString());
+    Assert.assertEquals("2", results.get(2).toString());
+    Assert.assertEquals("1", results.get(3).toString());
+    Assert.assertEquals(null, results.get(4));
+  }
+
   private void getRootField_common(final SomeAvroDatum record, final AvroFlattenerMaker flattener)
   {
     Assert.assertEquals(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org