You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by pg...@apache.org on 2021/10/21 09:39:14 UTC
[hive] branch master updated: HIVE-25541: JsonSerDe: TBLPROPERTY
treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)
This is an automated email from the ASF dual-hosted git repository.
pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 4df4b76 HIVE-25541: JsonSerDe: TBLPROPERTY treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)
4df4b76 is described below
commit 4df4b769e8217b2a40ce74f415a859379ab13986
Author: Panagiotis Garefalakis <pg...@apache.org>
AuthorDate: Thu Oct 21 12:38:58 2021 +0300
HIVE-25541: JsonSerDe: TBLPROPERTY treating nested json as String (Panos Garefalakis, reviewed by Laszlo Bodor)
Closes #2664
---
data/files/nested_sample_1.json | 3 +
data/files/nested_sample_2.json | 1 +
.../queries/clientpositive/nested_json_string.q | 13 ++++
.../clientpositive/llap/nested_json_string.q.out | 52 ++++++++++++++++
.../org/apache/hadoop/hive/serde2/JsonSerDe.java | 10 +++-
.../hadoop/hive/serde2/json/HiveJsonReader.java | 34 ++++++++---
.../apache/hadoop/hive/serde2/TestJsonSerDe.java | 70 ++++++++++++++++++++++
serde/src/test/resources/json/nested_sample_1.json | 1 +
serde/src/test/resources/json/nested_sample_2.json | 1 +
9 files changed, 176 insertions(+), 9 deletions(-)
diff --git a/data/files/nested_sample_1.json b/data/files/nested_sample_1.json
new file mode 100644
index 0000000..c2432ee
--- /dev/null
+++ b/data/files/nested_sample_1.json
@@ -0,0 +1,3 @@
+{"__dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"__dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-200","location":{"name":"Brooklyn, NY"},"id":4020966003,"child_ids":[]}
+{"_dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"_dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-199","location":{"name":"Brooklyn, NY"},"id":4020965003,"child_ids":[]}
+{"_dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"_dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-198","location":{"name":"Brooklyn, NY"},"id":4020964003,"child_ids":[]}
\ No newline at end of file
diff --git a/data/files/nested_sample_2.json b/data/files/nested_sample_2.json
new file mode 100644
index 0000000..1d38549
--- /dev/null
+++ b/data/files/nested_sample_2.json
@@ -0,0 +1 @@
+{"data":{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}},"messageId":"2475185636801962","publish_time":1622514629783,"attributes":{"region":"IN"}}
\ No newline at end of file
diff --git a/ql/src/test/queries/clientpositive/nested_json_string.q b/ql/src/test/queries/clientpositive/nested_json_string.q
new file mode 100644
index 0000000..ce66ec9
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/nested_json_string.q
@@ -0,0 +1,13 @@
+CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMENT 'from deseriali [...]
+
+LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`;
+
+SELECT * FROM nested_v1;
+--
+-- Inner nesting
+--
+CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
+--
+LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2;
+--
+SELECT * FROM nested_v2;
diff --git a/ql/src/test/results/clientpositive/llap/nested_json_string.q.out b/ql/src/test/results/clientpositive/llap/nested_json_string.q.out
new file mode 100644
index 0000000..f5c33c3
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/nested_json_string.q.out
@@ -0,0 +1,52 @@
+PREHOOK: query: CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMENT [...]
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@nested_v1
+POSTHOOK: query: CREATE EXTERNAL TABLE `nested_v1` (`_dc_timelabel` TIMESTAMP COMMENT 'from deserializer', `_dc_load_time` TIMESTAMP COMMENT 'from deserializer', `id` BIGINT COMMENT 'from deserializer', `name` STRING COMMENT 'from deserializer', `location` STRING COMMENT 'from deserializer', `primary_contact_user_id` BIGINT COMMENT 'from deserializer', `parent_id` BIGINT COMMENT 'from deserializer', `parent_office_external_id` STRING COMMENT 'from deserializer', `child_ids` STRING COMMEN [...]
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@nested_v1
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@nested_v1
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_1.json' INTO TABLE `nested_v1`
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@nested_v1
+PREHOOK: query: SELECT * FROM nested_v1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_v1
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM nested_v1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_v1
+#### A masked pattern was here ####
+NULL NULL 4020966003 Brooklyn-200 {"name":"Brooklyn, NY"} 4226871003 NULL NULL [] [] NULL
+2021-08-19 04:10:00 2021-08-19 04:20:19 4020965003 Brooklyn-199 {"name":"Brooklyn, NY"} 4226871003 NULL NULL [] [] NULL
+2021-08-19 04:10:00 2021-08-19 04:20:19 4020964003 Brooklyn-198 {"name":"Brooklyn, NY"} 4226871003 NULL NULL [] [] NULL
+PREHOOK: query: CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@nested_v2
+POSTHOOK: query: CREATE EXTERNAL TABLE `nested_v2` (data STRING, messageid STRING, publish_time BIGINT, attributes STRING) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@nested_v2
+PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2
+PREHOOK: type: LOAD
+#### A masked pattern was here ####
+PREHOOK: Output: default@nested_v2
+POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/nested_sample_2.json' INTO TABLE nested_v2
+POSTHOOK: type: LOAD
+#### A masked pattern was here ####
+POSTHOOK: Output: default@nested_v2
+PREHOOK: query: SELECT * FROM nested_v2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@nested_v2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM nested_v2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@nested_v2
+#### A masked pattern was here ####
+{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}} 2475185636801962 1622514629783 {"region":"IN"}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
index a584995..14d5c6e 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java
@@ -50,10 +50,11 @@ import org.apache.hive.common.util.TimestampParser;
*/
@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS,
serdeConstants.LIST_COLUMN_TYPES, serdeConstants.TIMESTAMP_FORMATS,
- JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA })
+ JsonSerDe.BINARY_FORMAT, JsonSerDe.IGNORE_EXTRA, JsonSerDe.STRINGIFY_COMPLEX })
public class JsonSerDe extends AbstractSerDe {
public static final String BINARY_FORMAT = "json.binary.format";
+ public static final String STRINGIFY_COMPLEX = "json.stringify.complex.fields";
public static final String IGNORE_EXTRA = "text.ignore.extra.fields";
public static final String NULL_EMPTY_LINES = "text.null.empty.line";
@@ -147,7 +148,12 @@ public class JsonSerDe extends AbstractSerDe {
final String ignoreExtras = tbl.getProperty(IGNORE_EXTRA, "true");
if (Boolean.parseBoolean(ignoreExtras)) {
- this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UKNOWN_FIELDS);
+ this.jsonReader.enable(HiveJsonReader.Feature.IGNORE_UNKNOWN_FIELDS);
+ }
+
+ final String stringifyComplex = tbl.getProperty(STRINGIFY_COMPLEX, "true");
+ if (Boolean.parseBoolean(stringifyComplex)) {
+ this.jsonReader.enable(HiveJsonReader.Feature.STRINGIFY_COMPLEX_FIELDS);
}
log.debug("Initialized SerDe {}", this);
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
index 29be63e..b24ca0c 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/json/HiveJsonReader.java
@@ -45,11 +45,13 @@ import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.common.type.TimestampTZ;
+import org.apache.hadoop.hive.serde2.JsonSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
@@ -118,7 +120,7 @@ public class HiveJsonReader {
* <ul>
* <li>{@link #COL_INDEX_PARSING}</li>
* <li>{@link #PRIMITIVE_TO_WRITABLE}</li>
- * <li>{@link #IGNORE_UKNOWN_FIELDS}</li>
+ * <li>{@link #IGNORE_UNKNOWN_FIELDS}</li>
* </ul>
*/
public enum Feature {
@@ -141,7 +143,13 @@ public class HiveJsonReader {
* produce a log warnings. If this feature is disabled, an Exception will be
* thrown and parsing will stop.
*/
- IGNORE_UKNOWN_FIELDS
+ IGNORE_UNKNOWN_FIELDS,
+ /**
+ * If the JSON object being parsed includes a complex field with non defined Hive schema,
+ * enabling this feature will cause the JSON reader to treat the field as a String.
+ * If the feature is disabled, an Exception will be thrown and parsing will stop.
+ */
+ STRINGIFY_COMPLEX_FIELDS
}
/**
@@ -371,12 +379,14 @@ public class HiveJsonReader {
*/
private Object visitLeafNode(final JsonNode leafNode,
final ObjectInspector oi) throws SerDeException {
- Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.OBJECT);
- Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.ARRAY);
-
final PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
final PrimitiveTypeInfo typeInfo = poi.getTypeInfo();
+ if (typeInfo.getPrimitiveCategory() != PrimitiveCategory.STRING) {
+ Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.OBJECT);
+ Preconditions.checkArgument(leafNode.getNodeType() != JsonNodeType.ARRAY);
+ }
+
switch (typeInfo.getPrimitiveCategory()) {
case INT:
return Integer.valueOf(leafNode.asInt());
@@ -393,7 +403,17 @@ public class HiveJsonReader {
case DOUBLE:
return Double.valueOf(leafNode.asDouble());
case STRING:
- return leafNode.asText();
+ if (leafNode.isValueNode()) {
+ return leafNode.asText();
+ } else {
+ if (isEnabled(Feature.STRINGIFY_COMPLEX_FIELDS)) {
+ return leafNode.toString();
+ } else {
+ throw new SerDeException(
+ "Complex field found in JSON does not match table definition: " + typeInfo.getTypeName()
+ + ", please consider enabling `" + JsonSerDe.STRINGIFY_COMPLEX + "` table property");
+ }
+ }
case BINARY:
return getByteValue(leafNode);
case DATE:
@@ -490,7 +510,7 @@ public class HiveJsonReader {
this.discoveredFields.put(pair, structField);
} else {
// Tried everything and did not discover this field
- if (isEnabled(Feature.IGNORE_UKNOWN_FIELDS)
+ if (isEnabled(Feature.IGNORE_UNKNOWN_FIELDS)
&& this.discoveredUnknownFields.add(pair)) {
LOG.warn("Discovered unknown field: {}. Ignoring.", fieldName);
} else {
diff --git a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
index dc41b2e..1b24fd9 100644
--- a/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
+++ b/serde/src/test/org/apache/hadoop/hive/serde2/TestJsonSerDe.java
@@ -75,6 +75,76 @@ public class TestJsonSerDe {
}
@Test
+ public void testComplexType() throws Exception {
+ Properties props = new Properties();
+ props.setProperty(serdeConstants.LIST_COLUMNS,
+ "__dc_timelabel," + "__dc_load_time," + "id," + "name," + "location," + "primary_contact_user_id,"
+ + "parent_id," + "parent_office_external_id," + "child_ids," + "child_office_external_ids," +"external_id");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+ "timestamp," + "timestamp," + "bigint," + "string," + "string," + "bigint," + "bigint," + "string," + "string,"
+ + "string," + "string");
+ props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "yyyy-MM-ddHH:mm:ss");
+
+ JsonSerDe serde = new JsonSerDe();
+ serde.initialize(new Configuration(), props, null, false);
+
+ final String jsonText = loadJson("nested_sample_1.json");
+ final Text text = new Text(jsonText);
+ final List<?> results = (List<?>) serde.deserialize(text);
+
+ Assert.assertEquals(11, results.size());
+ Assert.assertEquals("Brooklyn-200", results.get(3));
+ // make sure inner struct can be decoded
+ Assert.assertEquals("{\"name\":\"Brooklyn,NY\"}", results.get(4));
+ }
+
+ @Test
+ public void testDisabledComplexType() throws Exception {
+ Properties props = new Properties();
+ props.setProperty(serdeConstants.LIST_COLUMNS,
+ "__dc_timelabel," + "__dc_load_time," + "id," + "name," + "location," + "primary_contact_user_id,"
+ + "parent_id," + "parent_office_external_id," + "child_ids," + "child_office_external_ids," +"external_id");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES,
+ "timestamp," + "timestamp," + "bigint," + "string," + "string," + "bigint," + "bigint," + "string," + "string,"
+ + "string," + "string");
+ props.setProperty(serdeConstants.TIMESTAMP_FORMATS, "yyyy-MM-ddHH:mm:ss");
+ props.setProperty(JsonSerDe.STRINGIFY_COMPLEX, "false");
+
+ JsonSerDe serde = new JsonSerDe();
+ serde.initialize(new Configuration(), props, null, false);
+
+ final String jsonText = loadJson("nested_sample_1.json");
+ final Text text = new Text(jsonText);
+
+ Exception exception = Assert.assertThrows(SerDeException.class, () -> serde.deserialize(text));
+ String expectedMessage = "Complex field found in JSON does not match table definition: string";
+ String actualMessage = exception.getMessage();
+ Assert.assertTrue(actualMessage.contains(expectedMessage));
+ }
+
+ @Test
+ public void testMoreComplexType() throws Exception {
+ Properties props = new Properties();
+ props.setProperty(serdeConstants.LIST_COLUMNS, "data," + "messageId," + "publish_time," + "attributes");
+ props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "string," + "string," + "bigint," + "string");
+
+ JsonSerDe serde = new JsonSerDe();
+ serde.initialize(new Configuration(), props, null, false);
+ final String jsonText = loadJson("nested_sample_2.json");
+
+ final Text text = new Text(jsonText);
+ final List<?> results = (List<?>) serde.deserialize(text);
+
+ Assert.assertEquals(4, results.size());
+ Assert.assertEquals("{\"H\":{\"event\":\"track_active\",\"platform\":\"Android\"},"
+ + "\"B\":{\"device_type\":\"Phone\",\"uuid\":"
+ + "\"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]\"}}", results.get(0));
+ Assert.assertEquals("2475185636801962", results.get(1));
+ Assert.assertEquals(1622514629783L, results.get(2));
+ Assert.assertEquals("{\"region\":\"IN\"}", results.get(3));
+ }
+
+ @Test
public void testArray() throws Exception {
Properties props = new Properties();
props.setProperty(serdeConstants.LIST_COLUMNS, "list,items");
diff --git a/serde/src/test/resources/json/nested_sample_1.json b/serde/src/test/resources/json/nested_sample_1.json
new file mode 100644
index 0000000..b8c9487
--- /dev/null
+++ b/serde/src/test/resources/json/nested_sample_1.json
@@ -0,0 +1 @@
+{"__dc_load_time":"2021-08-19 04:20:19","primary_contact_user_id":4226871003,"child_office_external_ids":[],"__dc_timelabel":"2021-08-19 04:10:00","name":"Brooklyn-200","location":{"name":"Brooklyn, NY"},"id":4020966003,"child_ids":[]}
\ No newline at end of file
diff --git a/serde/src/test/resources/json/nested_sample_2.json b/serde/src/test/resources/json/nested_sample_2.json
new file mode 100644
index 0000000..c5ace4a
--- /dev/null
+++ b/serde/src/test/resources/json/nested_sample_2.json
@@ -0,0 +1 @@
+{"data":{"H":{"event":"track_active","platform":"Android"},"B":{"device_type":"Phone","uuid":"[36ffec24-f6a4-4f5d-aa39-72e5513d2cae,11883bee-a7aa-4010-8a66-6c3c63a73f16]"}},"messageId":"2475185636801962","publish_time":1622514629783,"attributes":{"region":"IN"}}