You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pg...@apache.org on 2021/10/21 09:39:14 UTC

[hive] branch master updated: HIVE-25541: JsonSerDe: TBLPROPERTY treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)

This is an automated email from the ASF dual-hosted git repository.

pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 4df4b76  HIVE-25541: JsonSerDe: TBLPROPERTY treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)
4df4b76 is described below

commit 4df4b769e8217b2a40ce74f415a859379ab13986
Author: Panagiotis Garefalakis <pg...@apache.org>
AuthorDate: Thu Oct 21 12:38:58 2021 +0300

    HIVE-25541: JsonSerDe: TBLPROPERTY treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)
    
    Closes #2664
---
 data/files/nested_sample_1.json                    |  3 +
 data/files/nested_sample_2.json                    |  1 +
 .../queries/clientpositive/nested_json_string.q    | 13 ++++
 .../clientpositive/llap/nested_json_string.q.out   | 52 ++++++++++++++++
 .../org/apache/hadoop/hive/serde2/JsonSerDe.java   | 10 +++-
 .../hadoop/hive/serde2/json/HiveJsonReader.java    | 34 ++++++++---
 .../apache/hadoop/hive/serde2/TestJsonSerDe.java   | 70 ++++++++++++++++++++++
 serde/src/test/resources/json/nested_sample_1.json |  1 +
 serde/src/test/resources/json/nested_sample_2.json |  1 +
 9 files changed, 176 insertions(+), 9 deletions(-)

diff --git a/data/files/nested_sample_1.json b/data/files/nested_sample_1.json
new file mode 100644
index 0000000..c2432ee
--- /dev/null
+++ b/data/files/nested_sample_1.json
@@ -0,0 +1,3 @@
+{"__dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"__dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-200","location":{"name":"Brooklyn, NY"},"id":4020966003,"child_ids":[]}
+{"_dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"_dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-199","location":{"name":"Brooklyn, NY"},"id":4020965003,"child_ids":[]}
+{"_dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"_dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-198","location":{"name":"Brooklyn, NY"},"id":4020964003,"child_ids":[]}
\ No newline at end of file
diff --git a/data/files/nested_sample_2.json b/data/files/nested_sample_2.json
new file mode 100644
index 0000000..1d38549
--- /dev/null
+++ b/data/files/nested_sample_2.json
@@ -0,0 +1 @@
+{"data":{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}},"messageId":"2475185636801962","publish_time":1622514629783,"attributes":{"region":"IN"}}
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/nested_json_string.q b/ql/src/test/queries/clientpositive/nested_json_string.q
new file mode 100644
index 0000000..ce66ec9
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/nested_json_string.q
@@ -0,0 +1,13 @@
+CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMENT 'from deseriali [...]
+
+LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`;
+
+SELECT * FROM nested_v1;
+--
+-- Inner nesting
+--
+CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
+--
+LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2;
+--
+SELECT * FROM nested_v2;
diff --git a/ql/src/test/results/clientpositive/llap/nested_json_string.q.out b/ql/src/test/results/clientpositive/llap/nested_json_string.q.out
new file mode 100644
index 0000000..f5c33c3
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/nested_json_string.q.out
@@ -0,0 +1,52 @@
+PREHOOK: query: CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMENT [...]
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@nested_v1
+POSTHOOK: query: CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMEN [...]
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@nested_v1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@nested_v1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@nested_v1
+PREHOOK: query: SELECT * FROM nested_v1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_v1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM nested_v1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_v1
+#### A masked pattern was here ####
+NULL	NULL	4020966003	Brooklyn-200	{"name":"Brooklyn, NY"}	4226871003	NULL	NULL	[]	[]	NULL
+2021-08-19 04:10:00	2021-08-19 04:20:19	4020965003	Brooklyn-199	{"name":"Brooklyn, NY"}	4226871003	NULL	NULL	[]	[]	NULL
+2021-08-19 04:10:00	2021-08-19 04:20:19	4020964003	Brooklyn-198	{"name":"Brooklyn, NY"}	4226871003	NULL	NULL	[]	[]	NULL
+PREHOOK: query: CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@nested_v2
+POSTHOOK: query: CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@nested_v2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@nested_v2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@nested_v2
+PREHOOK: query: SELECT * FROM nested_v2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_v2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM nested_v2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_v2
+#### A masked pattern was here ####
+{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}}	2475185636801962	1622514629783	{"region":"IN"}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
index a584995..14d5c6e 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
@@ -50,10 +50,11 @@ import org.apache.hive.common.util.TimestampParser;
  */
 @SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS,
     serdeConstants.LIST_COLUMN_TYPES, serdeConstants.TIMESTAMP_FORMATS,
-    JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA })
+    JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA, JsonSerDe.STRINGIFY_COMPLEX })
 public class JsonSerDe extends AbstractSerDe {
 
   public static final String BINARY_FORMAT = "json.binary.format";
+  public static final String STRINGIFY_COMPLEX = "json.stringify.complex.fields";
   public static final String IGNORE_EXTRA = "text.ignore.extra.fields";
   public static final String NULL_EMPTY_LINES = "text.null.empty.line";
 
@@ -147,7 +148,12 @@ public class JsonSerDe extends AbstractSerDe {
 
     final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "true");
     if (Boolean.parseBoolean(ignoreExtras)) {
-      this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UKNOWN_FIELDS);
+      this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UNKNOWN_FIELDS);
+    }
+
+    final String stringifyComplex = tbl.getProperty(STRINGIFY_COMPLEX, "true");
+    if (Boolean.parseBoolean(stringifyComplex)) {
+      this.jsonReader.enable(HiveJsonReader.Feature.STRINGIFY_COMPLEX_FIELDS);
     }
 
     log.debug("Initialized SerDe {}", this);
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
index 29be63e..b24ca0c 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
@@ -45,11 +45,13 @@ import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.common.type.Timestamp;
 import org.apache.hadoop.hive.common.type.TimestampTZ;
+import org.apache.hadoop.hive.serde2.JsonSerDe;
 import org.apache.hadoop.hive.serde2.SerDeException;
 import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -118,7 +120,7 @@ public class HiveJsonReader {
    * <ul>
    * <li>{@link #COL_INDEX_PARSING}</li>
    * <li>{@link #PRIMITIVE_TO_WRITABLE}</li>
-   * <li>{@link #IGNORE_UKNOWN_FIELDS}</li>
+   * <li>{@link #IGNORE_UNKNOWN_FIELDS}</li>
    * </ul>
    */
   public enum Feature {
@@ -141,7 +143,13 @@ public class HiveJsonReader {
      * produce a log warnings. If this feature is disabled, an Exception will be
      * thrown and parsing will stop.
      */
-    IGNORE_UKNOWN_FIELDS
+    IGNORE_UNKNOWN_FIELDS,
+    /**
+     * If the JSON object being parsed includes a complex field with non defined Hive schema,
+     * enabling this feature will cause the JSON reader to treat the field as a String.
+     * If the feature is disabled, an Exception will be thrown and parsing will stop.
+     */
+    STRINGIFY_COMPLEX_FIELDS
   }
 
   /**
@@ -371,12 +379,14 @@ public class HiveJsonReader {
    */
   private Object visitLeafNode(final JsonNode leafNode,
       final ObjectInspector oi) throws SerDeException {
-    Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.OBJECT);
-    Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.ARRAY);
-
     final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
     final PrimitiveTypeInfo typeInfo = poi.getTypeInfo();
 
+    if (typeInfo.getPrimitiveCategory() != PrimitiveCategory.STRING) {
+      Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.OBJECT);
+      Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.ARRAY);
+    }
+
     switch (typeInfo.getPrimitiveCategory()) {
     case INT:
       return Integer.valueOf(leafNode.asInt());
@@ -393,7 +403,17 @@ public class HiveJsonReader {
     case DOUBLE:
       return Double.valueOf(leafNode.asDouble());
     case STRING:
-      return leafNode.asText();
+      if (leafNode.isValueNode()) {
+        return leafNode.asText();
+      } else {
+        if (isEnabled(Feature.STRINGIFY_COMPLEX_FIELDS)) {
+          return leafNode.toString();
+        } else {
+          throw new SerDeException(
+              "Complex field found in JSON does not match table definition: " + typeInfo.getTypeName()
+                  + ", please consider enabling `" + JsonSerDe.STRINGIFY_COMPLEX + "` table property");
+        }
+      }
     case BINARY:
       return getByteValue(leafNode);
     case DATE:
@@ -490,7 +510,7 @@ public class HiveJsonReader {
       this.discoveredFields.put(pair, structField);
     } else {
       // Tried everything and did not discover this field
-      if (isEnabled(Feature.IGNORE_UKNOWN_FIELDS)
+      if (isEnabled(Feature.IGNORE_UNKNOWN_FIELDS)
           && this.discoveredUnknownFields.add(pair)) {
         LOG.warn("Discovered unknown field: {}. Ignoring.", fieldName);
       } else {
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
index dc41b2e..1b24fd9 100644
--- a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
@@ -75,6 +75,76 @@ public class TestJsonSerDe {
   }
 
   @Test
+  public void testComplexType() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS,
+        "__dc_timelabel," + "__dc_load_time," + "id," + "name," + "location," + "primary_contact_user_id,"
+            + "parent_id," + "parent_office_external_id," + "child_ids," + "child_office_external_ids," +"external_id");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+        "timestamp," + "timestamp," + "bigint," + "string," + "string," + "bigint," + "bigint," + "string," + "string,"
+            + "string," + "string");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "yyyy-MM-ddHH:mm:ss");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(new Configuration(), props, null, false);
+
+    final String jsonText = loadJson("nested_sample_1.json");
+    final Text text = new Text(jsonText);
+    final List<?> results = (List<?>) serde.deserialize(text);
+
+    Assert.assertEquals(11, results.size());
+    Assert.assertEquals("Brooklyn-200", results.get(3));
+    // make sure inner struct can be decoded
+    Assert.assertEquals("{\"name\":\"Brooklyn,NY\"}", results.get(4));
+  }
+
+  @Test
+  public void testDisabledComplexType() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS,
+        "__dc_timelabel," + "__dc_load_time," + "id," + "name," + "location," + "primary_contact_user_id,"
+            + "parent_id," + "parent_office_external_id," + "child_ids," + "child_office_external_ids," +"external_id");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+        "timestamp," + "timestamp," + "bigint," + "string," + "string," + "bigint," + "bigint," + "string," + "string,"
+            + "string," + "string");
+    props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "yyyy-MM-ddHH:mm:ss");
+    props.setProperty(JsonSerDe.STRINGIFY_COMPLEX, "false");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(new Configuration(), props, null, false);
+
+    final String jsonText = loadJson("nested_sample_1.json");
+    final Text text = new Text(jsonText);
+
+    Exception exception = Assert.assertThrows(SerDeException.class, () -> serde.deserialize(text));
+    String expectedMessage = "Complex field found in JSON does not match table definition: string";
+    String actualMessage = exception.getMessage();
+    Assert.assertTrue(actualMessage.contains(expectedMessage));
+  }
+
+  @Test
+  public void testMoreComplexType() throws Exception {
+    Properties props = new Properties();
+    props.setProperty(serdeConstants.LIST_COLUMNS, "data," + "messageId," + "publish_time," + "attributes");
+    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string," + "string," + "bigint," + "string");
+
+    JsonSerDe serde = new JsonSerDe();
+    serde.initialize(new Configuration(), props, null, false);
+    final String jsonText = loadJson("nested_sample_2.json");
+
+    final Text text = new Text(jsonText);
+    final List<?> results = (List<?>) serde.deserialize(text);
+
+    Assert.assertEquals(4, results.size());
+    Assert.assertEquals("{\"H\":{\"event\":\"track_active\",\"platform\":\"Android\"},"
+        + "\"B\":{\"device_type\":\"Phone\",\"uuid\":"
+        + "\"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]\"}}", results.get(0));
+    Assert.assertEquals("2475185636801962", results.get(1));
+    Assert.assertEquals(1622514629783L, results.get(2));
+    Assert.assertEquals("{\"region\":\"IN\"}", results.get(3));
+  }
+
+  @Test
   public void testArray() throws Exception {
     Properties props = new Properties();
     props.setProperty(serdeConstants.LIST_COLUMNS, "list,items");
diff --git a/serde/src/test/resources/json/nested_sample_1.json b/serde/src/test/resources/json/nested_sample_1.json
new file mode 100644
index 0000000..b8c9487
--- /dev/null
+++ b/serde/src/test/resources/json/nested_sample_1.json
@@ -0,0 +1 @@
+{"__dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"__dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-200","location":{"name":"Brooklyn, NY"},"id":4020966003,"child_ids":[]}
\ No newline at end of file
diff --git a/serde/src/test/resources/json/nested_sample_2.json b/serde/src/test/resources/json/nested_sample_2.json
new file mode 100644
index 0000000..c5ace4a
--- /dev/null
+++ b/serde/src/test/resources/json/nested_sample_2.json
@@ -0,0 +1 @@
+{"data":{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}},"messageId":"2475185636801962","publish_time":1622514629783,"attributes":{"region":"IN"}}