You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by kg...@apache.org on 2018/07/10 15:11:56 UTC
[2/2] hive git commit: HIVE-18545: Add UDF to parse complex types
from json (Zoltan Haindrich reviewed by Ashutosh Chauhan)
HIVE-18545: Add UDF to parse complex types from json (Zoltan Haindrich reviewed by Ashutosh Chauhan)
Signed-off-by: Zoltan Haindrich <ki...@rxd.hu>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1105ef39
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1105ef39
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1105ef39
Branch: refs/heads/master
Commit: 1105ef3974d8a324637d3d35881a739af3aeb382
Parents: 18fb1b3
Author: Zoltan Haindrich <ki...@rxd.hu>
Authored: Tue Jul 10 16:05:10 2018 +0200
Committer: Zoltan Haindrich <ki...@rxd.hu>
Committed: Tue Jul 10 16:05:10 2018 +0200
----------------------------------------------------------------------
.../apache/hive/hcatalog/data/JsonSerDe.java | 612 ++----------------
.../hive/hcatalog/data/TestJsonSerDe.java | 12 +-
.../benchmark/udf/json_read/JsonReadBench.java | 83 +++
.../hive/benchmark/udf/json_read/val1.json | 86 +++
.../hive/benchmark/udf/json_read/val1.type | 1 +
.../hadoop/hive/ql/exec/FunctionRegistry.java | 1 +
.../hive/ql/udf/generic/GenericUDFJsonRead.java | 92 +++
.../ql/udf/generic/TestGenericUDFJsonRead.java | 204 ++++++
.../test/queries/clientpositive/json_serde2.q | 37 ++
.../test/queries/clientpositive/udf_json_read.q | 44 ++
.../results/clientpositive/json_serde2.q.out | 113 ++++
.../results/clientpositive/show_functions.q.out | 1 +
.../results/clientpositive/udf_json_read.q.out | 107 +++
.../apache/hadoop/hive/serde2/JsonSerDe.java | 645 ++++++-------------
.../hive/serde2/json/HiveJsonStructReader.java | 402 ++++++++++++
.../apache/hive/streaming/StrictJsonWriter.java | 2 +-
16 files changed, 1428 insertions(+), 1014 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
----------------------------------------------------------------------
diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
index af80c02..87611ad 100644
--- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
+++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/data/JsonSerDe.java
@@ -18,125 +18,52 @@
*/
package org.apache.hive.hcatalog.data;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Properties;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.Set;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hive.common.type.Date;
-import org.apache.hadoop.hive.common.type.HiveChar;
-import org.apache.hadoop.hive.common.type.HiveDecimal;
-import org.apache.hadoop.hive.common.type.HiveVarchar;
-import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
-import org.apache.hadoop.hive.serde2.SerDeUtils;
-import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.StructField;
-import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
-import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
-import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
-import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
-import org.apache.hive.common.util.HiveStringUtils;
-import org.apache.hive.common.util.TimestampParser;
import org.apache.hive.hcatalog.common.HCatException;
-import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
-import org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchemaUtils;
-import org.codehaus.jackson.JsonFactory;
-import org.codehaus.jackson.JsonParseException;
-import org.codehaus.jackson.JsonParser;
-import org.codehaus.jackson.JsonToken;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS,
serdeConstants.LIST_COLUMN_TYPES,
serdeConstants.TIMESTAMP_FORMATS})
-
public class JsonSerDe extends AbstractSerDe {
private static final Logger LOG = LoggerFactory.getLogger(JsonSerDe.class);
- private List<String> columnNames;
private HCatSchema schema;
- private JsonFactory jsonFactory = null;
-
private HCatRecordObjectInspector cachedObjectInspector;
- private TimestampParser tsParser;
+ private org.apache.hadoop.hive.serde2.JsonSerDe jsonSerde = new org.apache.hadoop.hive.serde2.JsonSerDe();
@Override
public void initialize(Configuration conf, Properties tbl)
throws SerDeException {
- List<TypeInfo> columnTypes;
- StructTypeInfo rowTypeInfo;
-
- LOG.debug("Initializing JsonSerDe: {}", tbl.entrySet());
-
- // Get column names and types
- String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
- String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
- final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? tbl
- .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
- // all table column names
- if (columnNameProperty.isEmpty()) {
- columnNames = Collections.emptyList();
- } else {
- columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
- }
-
- // all column types
- if (columnTypeProperty.isEmpty()) {
- columnTypes = Collections.emptyList();
- } else {
- columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
- }
- LOG.debug("columns: {}, {}", columnNameProperty, columnNames);
- LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes);
-
- assert (columnNames.size() == columnTypes.size());
-
- rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
+ jsonSerde.initialize(conf, tbl);
+ jsonSerde.setWriteablesUsage(false);
+ StructTypeInfo rowTypeInfo = jsonSerde.getTypeInfo();
cachedObjectInspector = HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector(rowTypeInfo);
try {
schema = HCatSchemaUtils.getHCatSchema(rowTypeInfo).get(0).getStructSubSchema();
@@ -145,10 +72,6 @@ public class JsonSerDe extends AbstractSerDe {
} catch (HCatException e) {
throw new SerDeException(e);
}
-
- jsonFactory = new JsonFactory();
- tsParser = new TimestampParser(
- HiveStringUtils.splitAndUnEscape(tbl.getProperty(serdeConstants.TIMESTAMP_FORMATS)));
}
/**
@@ -160,274 +83,82 @@ public class JsonSerDe extends AbstractSerDe {
*/
@Override
public Object deserialize(Writable blob) throws SerDeException {
-
- Text t = (Text) blob;
- JsonParser p;
- List<Object> r = new ArrayList<Object>(Collections.nCopies(columnNames.size(), null));
try {
- p = jsonFactory.createJsonParser(new ByteArrayInputStream((t.getBytes())));
- if (p.nextToken() != JsonToken.START_OBJECT) {
- throw new IOException("Start token not found where expected");
- }
- JsonToken token;
- while (((token = p.nextToken()) != JsonToken.END_OBJECT) && (token != null)) {
- // iterate through each token, and create appropriate object here.
- populateRecord(r, token, p, schema);
- }
- } catch (JsonParseException e) {
- LOG.warn("Error [{}] parsing json text [{}].", e, t);
- throw new SerDeException(e);
- } catch (IOException e) {
- LOG.warn("Error [{}] parsing json text [{}].", e, t);
+ // Parsing is delegated to the wrapped serde2 JsonSerDe; its row is cast to Object[] and
+ // flattened by fatLand into nested Lists/Maps before being wrapped in a DefaultHCatRecord.
+ Object row = jsonSerde.deserialize(blob);
+ List fatRow = fatLand((Object[]) row);
+ return new DefaultHCatRecord(fatRow);
+ } catch (SerDeException e) {
+ // Already the declared exception type: rethrow unchanged rather than nesting it
+ // inside a second SerDeException.
+ throw e;
+ } catch (Exception e) {
throw new SerDeException(e);
}
-
- return new DefaultHCatRecord(r);
}
- private void populateRecord(List<Object> r, JsonToken token, JsonParser p, HCatSchema s) throws IOException {
- if (token != JsonToken.FIELD_NAME) {
- throw new IOException("Field name expected");
- }
- String fieldName = p.getText();
- Integer fpos = s.getPosition(fieldName);
- if (fpos == null) {
- fpos = getPositionFromHiveInternalColumnName(fieldName);
- LOG.debug("NPE finding position for field [{}] in schema [{}],"
- +" attempting to check if it is an internal column name like _col0", fieldName, s);
- if (fpos == -1) {
- skipValue(p);
- return; // unknown field, we return. We'll continue from the next field onwards.
- }
- // If we get past this, then the column name did match the hive pattern for an internal
- // column name, such as _col0, etc, so it *MUST* match the schema for the appropriate column.
- // This means people can't use arbitrary column names such as _col0, and expect us to ignore it
- // if we find it.
- if (!fieldName.equalsIgnoreCase(getHiveInternalColumnName(fpos))) {
- LOG.error("Hive internal column name {} and position "
- + "encoding {} for the column name are at odds", fieldName, fpos);
- throw new IOException("Hive internal column name ("+ fieldName
- + ") and position encoding (" + fpos
- + ") for the column name are at odds");
+ /**
+ * Converts an array of values into a List, converting nested containers on the way:
+ * Map elements go through fatMap, List elements and non-byte arrays are converted to Lists
+ * (object arrays recursively, primitive arrays via primitiveArrayToList). Every other
+ * element, including null and byte[], is added unchanged.
+ */
+ @SuppressWarnings({"rawtypes", "unchecked" })
+ private static List fatLand(Object[] arr) {
+ List flattened = new ArrayList<>();
+ for (Object element : arr) {
+ Object converted = element;
+ if (element instanceof Map<?, ?>) {
+ converted = fatMap((Map) element);
+ } else if (element instanceof List<?>) {
+ converted = fatLand(((List) element).toArray());
+ } else if (element != null && element.getClass().isArray()) {
+ Class<?> componentType = element.getClass().getComponentType();
+ // byte[] is deliberately kept as-is.
+ if (componentType != byte.class) {
+ converted = componentType.isPrimitive() ? primitiveArrayToList(element) : fatLand((Object[]) element);
+ }
}
- // If we reached here, then we were successful at finding an alternate internal
- // column mapping, and we're about to proceed.
+ flattened.add(converted);
}
- HCatFieldSchema hcatFieldSchema = s.getFields().get(fpos);
- Object currField = extractCurrentField(p, hcatFieldSchema, false);
- r.set(fpos, currField);
+ return flattened;
}
- public String getHiveInternalColumnName(int fpos) {
- return HiveConf.getColumnInternalName(fpos);
- }
-
- public int getPositionFromHiveInternalColumnName(String internalName) {
-// return HiveConf.getPositionFromInternalName(fieldName);
- // The above line should have been all the implementation that
- // we need, but due to a bug in that impl which recognizes
- // only single-digit columns, we need another impl here.
- Pattern internalPattern = Pattern.compile("_col([0-9]+)");
- Matcher m = internalPattern.matcher(internalName);
- if (!m.matches()) {
- return -1;
- } else {
- return Integer.parseInt(m.group(1));
+ /**
+ * Copies a map into a LinkedHashMap (keeping iteration order) and flattens its values:
+ * nested Maps and Lists are converted recursively, object arrays become Lists via fatLand,
+ * and primitive arrays other than byte[] become Lists via primitiveArrayToList.
+ * Null values and byte[] values are copied unchanged. Keys are never converted.
+ */
+ @SuppressWarnings({"rawtypes", "unchecked" })
+ private static Object fatMap(Map<Object, Object> map) {
+ Map ret = new LinkedHashMap<>();
+ for (Entry<Object, Object> e : map.entrySet()) {
+ Object oldV = e.getValue();
+ Object newV = oldV;
+ if (oldV instanceof Map<?, ?>) {
+ newV = fatMap((Map) oldV);
+ } else if (oldV instanceof List<?>) {
+ newV = fatLand(((List) oldV).toArray());
+ } else if (oldV != null && oldV.getClass().isArray()) {
+ Class<?> ct = oldV.getClass().getComponentType();
+ if (!ct.isPrimitive()) {
+ newV = fatLand((Object[]) oldV);
+ } else if (ct != byte.class) {
+ // A primitive array cannot be cast to Object[]; byte[] stays as-is, as in fatLand.
+ newV = primitiveArrayToList(oldV);
+ }
+ }
+ ret.put(e.getKey(), newV);
}
+ return ret;
}
- /**
- * Utility method to extract (and forget) the next value token from the JsonParser,
- * as a whole. The reason this function gets called is to yank out the next value altogether,
- * because it corresponds to a field name that we do not recognize, and thus, do not have
- * a schema/type for. Thus, this field is to be ignored.
- * @throws IOException
- * @throws JsonParseException
- */
- private void skipValue(JsonParser p) throws JsonParseException, IOException {
- JsonToken valueToken = p.nextToken();
-
- if ((valueToken == JsonToken.START_ARRAY) || (valueToken == JsonToken.START_OBJECT)){
- // if the currently read token is a beginning of an array or object, move stream forward
- // skipping any child tokens till we're at the corresponding END_ARRAY or END_OBJECT token
- p.skipChildren();
+ /**
+ * Boxes a primitive array into a List of the matching wrapper type.
+ *
+ * @param arr an array whose component type is one of the eight Java primitive types
+ * @return a fixed-size List view (Arrays.asList) over the boxed copy of the array
+ * @throws RuntimeException if the component type of arr is not handled
+ */
+ private static Object primitiveArrayToList(Object arr) {
+ Class<?> ct = arr.getClass().getComponentType();
+ if (int.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((int[]) arr));
}
- // At the end of this function, the stream should be pointing to the last token that
- // corresponds to the value being skipped. This way, the next call to nextToken
- // will advance it to the next field name.
- }
-
- /**
- * Utility method to extract current expected field from given JsonParser
- *
- * isTokenCurrent is a boolean variable also passed in, which determines
- * if the JsonParser is already at the token we expect to read next, or
- * needs advancing to the next before we read.
- */
- private Object extractCurrentField(JsonParser p, HCatFieldSchema hcatFieldSchema,
- boolean isTokenCurrent) throws IOException {
- Object val = null;
- JsonToken valueToken;
- if (isTokenCurrent) {
- valueToken = p.getCurrentToken();
- } else {
- valueToken = p.nextToken();
+ if (long.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((long[]) arr));
}
- switch (hcatFieldSchema.getType()) {
- case INT:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
- break;
- case TINYINT:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
- break;
- case SMALLINT:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
- break;
- case BIGINT:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
- break;
- case BOOLEAN:
- String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
- if (bval != null) {
- val = Boolean.valueOf(bval);
- } else {
- val = null;
- }
- break;
- case FLOAT:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
- break;
- case DOUBLE:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
- break;
- case STRING:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
- break;
- case BINARY:
- String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
- if (b != null) {
- try {
- String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
- return t.getBytes();
- } catch (CharacterCodingException e) {
- LOG.warn("Error generating json binary type from object.", e);
- return null;
- }
- } else {
- val = null;
- }
- break;
- case DATE:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
- break;
- case TIMESTAMP:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : tsParser.parseTimestamp(p.getText());
- break;
- case DECIMAL:
- val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
- break;
- case VARCHAR:
- int vLen = ((BaseCharTypeInfo)hcatFieldSchema.getTypeInfo()).getLength();
- val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
- break;
- case CHAR:
- int cLen = ((BaseCharTypeInfo)hcatFieldSchema.getTypeInfo()).getLength();
- val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
- break;
- case ARRAY:
- if (valueToken == JsonToken.VALUE_NULL) {
- val = null;
- break;
- }
- if (valueToken != JsonToken.START_ARRAY) {
- throw new IOException("Start of Array expected");
- }
- List<Object> arr = new ArrayList<Object>();
- while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
- arr.add(extractCurrentField(p, hcatFieldSchema.getArrayElementSchema().get(0), true));
- }
- val = arr;
- break;
- case MAP:
- if (valueToken == JsonToken.VALUE_NULL) {
- val = null;
- break;
- }
- if (valueToken != JsonToken.START_OBJECT) {
- throw new IOException("Start of Object expected");
- }
- Map<Object, Object> map = new LinkedHashMap<Object, Object>();
- HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
- while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
- Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(), hcatFieldSchema.getMapKeyTypeInfo());
- Object v = extractCurrentField(p, valueSchema, false);
- map.put(k, v);
- }
- val = map;
- break;
- case STRUCT:
- if (valueToken == JsonToken.VALUE_NULL) {
- val = null;
- break;
- }
- if (valueToken != JsonToken.START_OBJECT) {
- throw new IOException("Start of Object expected");
- }
- HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
- int sz = subSchema.getFieldNames().size();
-
- List<Object> struct = new ArrayList<Object>(Collections.nCopies(sz, null));
- while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
- populateRecord(struct, valueToken, p, subSchema);
- }
- val = struct;
- break;
- default:
- LOG.error("Unknown type found: " + hcatFieldSchema.getType());
- return null;
+ if (char.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((char[]) arr));
+ }
+ if (byte.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((byte[]) arr));
+ }
+ if (short.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((short[]) arr));
}
- return val;
+ if (float.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((float[]) arr));
+ }
+ if (double.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((double[]) arr));
+ }
+ // boolean is the eighth primitive type; without this branch a boolean[] hit the throw below.
+ if (boolean.class.equals(ct)) {
+ return Arrays.asList(ArrayUtils.toObject((boolean[]) arr));
+ }
+ throw new RuntimeException("Unhandled primitiveArrayToList for type: " + ct);
}
- private Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType)
- throws IOException {
- switch (Type.getPrimitiveHType(mapKeyType)) {
- case INT:
- return Integer.valueOf(s);
- case TINYINT:
- return Byte.valueOf(s);
- case SMALLINT:
- return Short.valueOf(s);
- case BIGINT:
- return Long.valueOf(s);
- case BOOLEAN:
- return (s.equalsIgnoreCase("true"));
- case FLOAT:
- return Float.valueOf(s);
- case DOUBLE:
- return Double.valueOf(s);
- case STRING:
- return s;
- case BINARY:
- try {
- String t = Text.decode(s.getBytes(), 0, s.getBytes().length);
- return t.getBytes();
- } catch (CharacterCodingException e) {
- LOG.warn("Error generating json binary type from object.", e);
- return null;
- }
- case DATE:
- return Date.valueOf(s);
- case TIMESTAMP:
- return Timestamp.valueOf(s);
- case DECIMAL:
- return HiveDecimal.create(s);
- case VARCHAR:
- return new HiveVarchar(s, ((BaseCharTypeInfo)mapKeyType).getLength());
- case CHAR:
- return new HiveChar(s, ((BaseCharTypeInfo)mapKeyType).getLength());
- }
- throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName());
+ /**
+ * Returns the Hive internal column name for a column position, as computed by
+ * {@link HiveConf#getColumnInternalName(int)}.
+ *
+ * @param fpos the column position
+ * @return the internal column name for that position
+ */
+ public String getHiveInternalColumnName(int fpos) {
+ return HiveConf.getColumnInternalName(fpos);
}
/**
@@ -437,217 +168,8 @@ public class JsonSerDe extends AbstractSerDe {
@Override
public Writable serialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
- StringBuilder sb = new StringBuilder();
- try {
-
- StructObjectInspector soi = (StructObjectInspector) objInspector;
- List<? extends StructField> structFields = soi.getAllStructFieldRefs();
- assert (columnNames.size() == structFields.size());
- if (obj == null) {
- sb.append("null");
- } else {
- sb.append(SerDeUtils.LBRACE);
- for (int i = 0; i < structFields.size(); i++) {
- if (i > 0) {
- sb.append(SerDeUtils.COMMA);
- }
- appendWithQuotes(sb, columnNames.get(i));
- sb.append(SerDeUtils.COLON);
- buildJSONString(sb, soi.getStructFieldData(obj, structFields.get(i)),
- structFields.get(i).getFieldObjectInspector());
- }
- sb.append(SerDeUtils.RBRACE);
- }
-
- } catch (IOException e) {
- LOG.warn("Error generating json text from object.", e);
- throw new SerDeException(e);
- }
- return new Text(sb.toString());
- }
- private static StringBuilder appendWithQuotes(StringBuilder sb, String value) {
- return sb == null ? null : sb.append(SerDeUtils.QUOTE).append(value).append(SerDeUtils.QUOTE);
+ // JSON text generation is delegated to the wrapped org.apache.hadoop.hive.serde2.JsonSerDe,
+ // replacing the hand-written string builder that used to live in this class.
+ return jsonSerde.serialize(obj, objInspector);
}
- // TODO : code section copied over from SerDeUtils because of non-standard json production there
- // should use quotes for all field names. We should fix this there, and then remove this copy.
- // See http://jackson.codehaus.org/1.7.3/javadoc/org/codehaus/jackson/JsonParser.Feature.html#ALLOW_UNQUOTED_FIELD_NAMES
- // for details - trying to enable Jackson to ignore that doesn't seem to work(compilation failure
- // when attempting to use that feature, so having to change the production itself.
- // Also, throws IOException when Binary is detected.
- private static void buildJSONString(StringBuilder sb, Object o, ObjectInspector oi) throws IOException {
-
- switch (oi.getCategory()) {
- case PRIMITIVE: {
- PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
- if (o == null) {
- sb.append("null");
- } else {
- switch (poi.getPrimitiveCategory()) {
- case BOOLEAN: {
- boolean b = ((BooleanObjectInspector) poi).get(o);
- sb.append(b ? "true" : "false");
- break;
- }
- case BYTE: {
- sb.append(((ByteObjectInspector) poi).get(o));
- break;
- }
- case SHORT: {
- sb.append(((ShortObjectInspector) poi).get(o));
- break;
- }
- case INT: {
- sb.append(((IntObjectInspector) poi).get(o));
- break;
- }
- case LONG: {
- sb.append(((LongObjectInspector) poi).get(o));
- break;
- }
- case FLOAT: {
- sb.append(((FloatObjectInspector) poi).get(o));
- break;
- }
- case DOUBLE: {
- sb.append(((DoubleObjectInspector) poi).get(o));
- break;
- }
- case STRING: {
- String s =
- SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(o));
- appendWithQuotes(sb, s);
- break;
- }
- case BINARY:
- byte[] b = ((BinaryObjectInspector) oi).getPrimitiveJavaObject(o);
- Text txt = new Text();
- txt.set(b, 0, b.length);
- appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString()));
- break;
- case DATE:
- Date d = ((DateObjectInspector)poi).getPrimitiveJavaObject(o);
- appendWithQuotes(sb, d.toString());
- break;
- case TIMESTAMP: {
- Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(o);
- appendWithQuotes(sb, t.toString());
- break;
- }
- case DECIMAL:
- sb.append(((HiveDecimalObjectInspector)poi).getPrimitiveJavaObject(o));
- break;
- case VARCHAR: {
- String s = SerDeUtils.escapeString(
- ((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(o).toString());
- appendWithQuotes(sb, s);
- break;
- }
- case CHAR: {
- //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13)
- // HiveChar.toString() returns getPaddedValue()
- String s = SerDeUtils.escapeString(
- ((HiveCharObjectInspector) poi).getPrimitiveJavaObject(o).toString());
- appendWithQuotes(sb, s);
- break;
- }
- default:
- throw new RuntimeException("Unknown primitive type: " + poi.getPrimitiveCategory());
- }
- }
- break;
- }
- case LIST: {
- ListObjectInspector loi = (ListObjectInspector) oi;
- ObjectInspector listElementObjectInspector = loi
- .getListElementObjectInspector();
- List<?> olist = loi.getList(o);
- if (olist == null) {
- sb.append("null");
- } else {
- sb.append(SerDeUtils.LBRACKET);
- for (int i = 0; i < olist.size(); i++) {
- if (i > 0) {
- sb.append(SerDeUtils.COMMA);
- }
- buildJSONString(sb, olist.get(i), listElementObjectInspector);
- }
- sb.append(SerDeUtils.RBRACKET);
- }
- break;
- }
- case MAP: {
- MapObjectInspector moi = (MapObjectInspector) oi;
- ObjectInspector mapKeyObjectInspector = moi.getMapKeyObjectInspector();
- ObjectInspector mapValueObjectInspector = moi
- .getMapValueObjectInspector();
- Map<?, ?> omap = moi.getMap(o);
- if (omap == null) {
- sb.append("null");
- } else {
- sb.append(SerDeUtils.LBRACE);
- boolean first = true;
- for (Object entry : omap.entrySet()) {
- if (first) {
- first = false;
- } else {
- sb.append(SerDeUtils.COMMA);
- }
- Map.Entry<?, ?> e = (Map.Entry<?, ?>) entry;
- StringBuilder keyBuilder = new StringBuilder();
- buildJSONString(keyBuilder, e.getKey(), mapKeyObjectInspector);
- String keyString = keyBuilder.toString().trim();
- if((!keyString.isEmpty()) && (keyString.charAt(0) != SerDeUtils.QUOTE)) {
- appendWithQuotes(sb, keyString);
- }
- else {
- sb.append(keyString);
- }
- sb.append(SerDeUtils.COLON);
- buildJSONString(sb, e.getValue(), mapValueObjectInspector);
- }
- sb.append(SerDeUtils.RBRACE);
- }
- break;
- }
- case STRUCT: {
- StructObjectInspector soi = (StructObjectInspector) oi;
- List<? extends StructField> structFields = soi.getAllStructFieldRefs();
- if (o == null) {
- sb.append("null");
- } else {
- sb.append(SerDeUtils.LBRACE);
- for (int i = 0; i < structFields.size(); i++) {
- if (i > 0) {
- sb.append(SerDeUtils.COMMA);
- }
- appendWithQuotes(sb, structFields.get(i).getFieldName());
- sb.append(SerDeUtils.COLON);
- buildJSONString(sb, soi.getStructFieldData(o, structFields.get(i)),
- structFields.get(i).getFieldObjectInspector());
- }
- sb.append(SerDeUtils.RBRACE);
- }
- break;
- }
- case UNION: {
- UnionObjectInspector uoi = (UnionObjectInspector) oi;
- if (o == null) {
- sb.append("null");
- } else {
- sb.append(SerDeUtils.LBRACE);
- sb.append(uoi.getTag(o));
- sb.append(SerDeUtils.COLON);
- buildJSONString(sb, uoi.getField(o),
- uoi.getObjectInspectors().get(uoi.getTag(o)));
- sb.append(SerDeUtils.RBRACE);
- }
- break;
- }
- default:
- throw new RuntimeException("Unknown type in ObjectInspector!");
- }
- }
-
/**
* Returns an object inspector for the specified schema that
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
----------------------------------------------------------------------
diff --git a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
index 6770d44..d476b43 100644
--- a/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
+++ b/hcatalog/core/src/test/java/org/apache/hive/hcatalog/data/TestJsonSerDe.java
@@ -27,8 +27,6 @@ import java.util.List;
import java.util.Map;
import java.util.Properties;
-import junit.framework.TestCase;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.HiveChar;
@@ -43,6 +41,8 @@ import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import junit.framework.TestCase;
+
public class TestJsonSerDe extends TestCase {
private static final Logger LOG = LoggerFactory.getLogger(TestJsonSerDe.class);
@@ -158,17 +158,17 @@ public class TestJsonSerDe extends TestCase {
Writable s = hrsd.serialize(r, hrsd.getObjectInspector());
LOG.info("ONE:{}", s);
- Object o1 = hrsd.deserialize(s);
+ HCatRecord o1 = (HCatRecord) hrsd.deserialize(s);
StringBuilder msg = new StringBuilder();
- boolean isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o1);
+ boolean isEqual = HCatDataCheckUtil.recordsEqual(r, o1);
assertTrue(msg.toString(), isEqual);
Writable s2 = jsde.serialize(o1, hrsd.getObjectInspector());
LOG.info("TWO:{}", s2);
- Object o2 = jsde.deserialize(s2);
+ HCatRecord o2 = (HCatRecord) jsde.deserialize(s2);
LOG.info("deserialized TWO : {} ", o2);
msg.setLength(0);
- isEqual = HCatDataCheckUtil.recordsEqual(r, (HCatRecord) o2, msg);
+ isEqual = HCatDataCheckUtil.recordsEqual(r, o2, msg);
assertTrue(msg.toString(), isEqual);
}
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java
new file mode 100644
index 0000000..aae247c
--- /dev/null
+++ b/itests/hive-jmh/src/main/java/org/apache/hive/benchmark/udf/json_read/JsonReadBench.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hive.benchmark.udf.json_read;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFJsonRead;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.Text;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+
+/**
+ * JMH benchmark that measures one initialize-and-evaluate round trip of
+ * {@link GenericUDFJsonRead} on the bundled {@code val1.json} document, using the
+ * type string stored in {@code val1.type}.
+ */
+public class JsonReadBench {
+
+  /** Benchmark input: the JSON document and its type string, read in the constructor. */
+  @State(Scope.Thread)
+  public static class MyState {
+
+    public final String json;
+    public final String type;
+
+    public MyState() {
+      try {
+        json = getResource("val1.json");
+        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
+        type = getResource("val1.type").toLowerCase(Locale.ROOT).trim();
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    /**
+     * Reads a classpath resource located next to this class as UTF-8 text.
+     *
+     * @param fname resource name, resolved relative to {@link JsonReadBench}
+     * @return the full resource content
+     * @throws IOException if the resource is missing or cannot be read
+     */
+    private String getResource(String fname) throws IOException {
+      // try-with-resources closes the stream, which was previously never closed.
+      try (InputStream in = JsonReadBench.class.getResourceAsStream(fname)) {
+        if (in == null) {
+          throw new IOException("Resource not found on classpath: " + fname);
+        }
+        return IOUtils.toString(in, StandardCharsets.UTF_8);
+      }
+    }
+  }
+
+  /** Runs the benchmark body once with a fresh state, outside of the JMH harness. */
+  public void checkBenchMarkMethod() throws Exception {
+    benchmarkMethod(new MyState());
+  }
+
+  /**
+   * Creates a UDF instance, initializes it with the state's type string and evaluates
+   * it once on the state's JSON text.
+   */
+  @Benchmark
+  public void benchmarkMethod(MyState state) throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments(state.type);
+      udf.initialize(arguments);
+
+      udf.evaluate(evalArgs(state.json));
+    }
+  }
+
+  /** Wraps the JSON text as the first evaluate() argument; the second slot is left null. */
+  private DeferredObject[] evalArgs(String string) {
+    return new DeferredObject[] { new GenericUDF.DeferredJavaObject(new Text(string)), null };
+  }
+
+  /**
+   * Builds the initialize() arguments: a writable string inspector for the value and a
+   * constant string inspector carrying the type string.
+   */
+  private ObjectInspector[] buildArguments(String typeStr) {
+    ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    ObjectInspector[] arguments = { valueOI, PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, new Text(typeStr)) };
+    return arguments;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json
new file mode 100644
index 0000000..4466539
--- /dev/null
+++ b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.json
@@ -0,0 +1,86 @@
+[
+{
+ "t0":"2017-08-1414:45:23.522000",
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+,
+{
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+,
+{
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+]
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type
----------------------------------------------------------------------
diff --git a/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type
new file mode 100644
index 0000000..3543223
--- /dev/null
+++ b/itests/hive-jmh/src/main/resources/org/apache/hive/benchmark/udf/json_read/val1.type
@@ -0,0 +1 @@
+array<struct<t0:timestamp,attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index 10517ad..0800a10 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -213,6 +213,7 @@ public final class FunctionRegistry {
system.registerGenericUDF("ceiling", GenericUDFCeil.class);
system.registerUDF("rand", UDFRand.class, false);
system.registerGenericUDF("abs", GenericUDFAbs.class);
+ system.registerGenericUDF("json_read", GenericUDFJsonRead.class);
system.registerGenericUDF("sq_count_check", GenericUDFSQCountCheck.class);
system.registerGenericUDF("enforce_constraint", GenericUDFEnforceConstraint.class);
system.registerGenericUDF("pmod", GenericUDFPosMod.class);
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
new file mode 100644
index 0000000..f5814ed
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFJsonRead.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.json.HiveJsonStructReader;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter.TextConverter;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+
+/**
+ * Parses a json string representation into a Hive struct.
+ */
+@Description(name = "json_read", value = "_FUNC_(json,type) - "
+ + "Parses the given json according to the given complex type specification", extended = ""
+ + "Parsed as null: if the json is null, it is the empty string or if it contains only whitespaces\n"
+ + "Example:\n" + "select _FUNC_('[]','array<struct<a:string>>' ")
+public class GenericUDFJsonRead extends GenericUDF {
+
+ private TextConverter inputConverter;
+ private HiveJsonStructReader jsonReader;
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+
+ checkArgsSize(arguments, 2, 2);
+ checkArgPrimitive(arguments, 0);
+ checkArgPrimitive(arguments, 1);
+ if (!ObjectInspectorUtils.isConstantObjectInspector(arguments[1])) {
+ throw new UDFArgumentTypeException(1, getFuncName() + " argument 2 may only be a constant");
+ }
+
+ inputConverter = new TextConverter((PrimitiveObjectInspector) arguments[0]);
+ String typeStr = getConstantStringValue(arguments, 1);
+
+ try {
+ TypeInfo t = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
+ jsonReader = new HiveJsonStructReader(t);
+ jsonReader.setWritablesUsage(true);
+ } catch (Exception e) {
+ throw new UDFArgumentException(getFuncName() + ": Error parsing typestring: " + e.getMessage());
+ }
+
+ return jsonReader.getObjectInspector();
+ }
+
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ Object valObject = arguments[0].get();
+ if (valObject == null) {
+ return null;
+ }
+
+ try {
+ String text = inputConverter.convert(valObject).toString();
+ if (text.trim().length() == 0) {
+ return null;
+ }
+ return jsonReader.parseStruct(text);
+ } catch (Exception e) {
+ throw new HiveException("Error parsing json: " + e.getMessage(), e);
+ }
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return getStandardDisplayString("json_read", children);
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java
new file mode 100644
index 0000000..3016eaf
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFJsonRead.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link GenericUDFJsonRead}: argument validation in {@code initialize}
+ * and parsing of lists, structs and maps in {@code evaluate}.
+ */
+public class TestGenericUDFJsonRead {
+
+  /** A single argument must be rejected by the argument count check. */
+  @Test(expected = UDFArgumentException.class)
+  public void testArgCnt1() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+      ObjectInspector[] arguments = { valueOI };
+      udf.initialize(arguments);
+    }
+  }
+
+  /** Three arguments must be rejected by the argument count check. */
+  @Test(expected = UDFArgumentException.class)
+  public void testArgCnt3() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+      ObjectInspector[] arguments = { valueOI, valueOI, valueOI };
+      udf.initialize(arguments);
+    }
+  }
+
+  /** Two arguments are accepted only if the second one (the type) is a constant. */
+  @Test(expected = UDFArgumentException.class)
+  public void testArgNotConstant() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+      ObjectInspector[] arguments = { valueOI, valueOI };
+      udf.initialize(arguments);
+    }
+  }
+
+  /** A type string that cannot be parsed must fail during initialization. */
+  @Test(expected = UDFArgumentException.class)
+  public void testArgInvalidType() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("__invalid__type__");
+      udf.initialize(arguments);
+    }
+  }
+
+  @Test
+  public void testList() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("array<string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("[\"a\",\"b\",null]"));
+      assertTrue(res instanceof List<?>);
+      List<?> l = (List<?>) res;
+      assertEquals(3, l.size());
+      assertEquals(new Text("a"), l.get(0));
+      assertEquals(new Text("b"), l.get(1));
+      assertNull(l.get(2));
+    }
+  }
+
+  @Test
+  public void testListNull() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("array<string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("null"));
+      assertNull(res);
+    }
+  }
+
+  @Test
+  public void testSimpleStruct() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("{\"a\":\"b\"}"));
+      assertTrue(res instanceof Object[]);
+      Object[] o = (Object[]) res;
+      assertEquals(new Text("b"), o[0]);
+    }
+  }
+
+  @Test
+  public void testStructNullField() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("{\"a\":null}"));
+      assertTrue(res instanceof Object[]);
+      Object[] o = (Object[]) res;
+      assertNull(o[0]);
+    }
+  }
+
+  /** An empty json string evaluates to null instead of raising a parse error. */
+  @Test
+  public void testStructEmptyString() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs(""));
+      assertNull(res);
+    }
+  }
+
+  /** A null input value evaluates to null. */
+  @Test
+  public void testStructNull() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(new DeferredObject[] { new DeferredJavaObject(null), null });
+      assertNull(res);
+    }
+  }
+
+  @Test
+  public void testStructNullComplexField() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:struct<x:string>>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("{\"a\":null}"));
+      assertTrue(res instanceof Object[]);
+      Object[] o = (Object[]) res;
+      assertNull(o[0]);
+    }
+  }
+
+  /** A json field that is not declared in the struct type must make evaluation fail. */
+  @Test(expected = HiveException.class)
+  public void testUndeclaredStructField() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("struct<a:int>");
+      udf.initialize(arguments);
+
+      // Expected to throw; nothing after this call would be reached.
+      udf.evaluate(evalArgs("{\"b\":null}"));
+    }
+  }
+
+  /** A json object appearing where an int element is expected must make evaluation fail. */
+  @Test(expected = HiveException.class)
+  public void testUnexpectedStruct() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("array<int>");
+      udf.initialize(arguments);
+
+      // Expected to throw; nothing after this call would be reached.
+      udf.evaluate(evalArgs("[1,22,2,{\"b\":null}]"));
+    }
+  }
+
+  @Test
+  public void testMap() throws Exception {
+    try (GenericUDFJsonRead udf = new GenericUDFJsonRead()) {
+      ObjectInspector[] arguments = buildArguments("map<string,string>");
+      udf.initialize(arguments);
+
+      Object res = udf.evaluate(evalArgs("{\"a\":\"v\"}"));
+      assertTrue(res instanceof Map);
+      Map<?, ?> o = (Map<?, ?>) res;
+      assertEquals(1, o.size());
+      assertEquals(new Text("v"), o.get(new Text("a")));
+    }
+  }
+
+  /**
+   * Wraps the json text as the first evaluation argument; the second slot is left null
+   * because the type string is supplied as a constant through the object inspector.
+   */
+  private DeferredObject[] evalArgs(String string) {
+    return new DeferredObject[] { new DeferredJavaObject(new Text(string)), null };
+  }
+
+  /**
+   * Builds the inspectors for {@code json_read(json, type)}: a writable string inspector
+   * for the json value and a constant string inspector carrying {@code typeStr}.
+   */
+  private ObjectInspector[] buildArguments(String typeStr) {
+    ObjectInspector valueOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    ObjectInspector[] arguments = { valueOI, PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, new Text(typeStr)) };
+    return arguments;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/queries/clientpositive/json_serde2.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/json_serde2.q b/ql/src/test/queries/clientpositive/json_serde2.q
new file mode 100644
index 0000000..c6088b7
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/json_serde2.q
@@ -0,0 +1,37 @@
+--! qt:dataset:src
+
+add jar ${system:maven.local.repository}/org/apache/hive/hcatalog/hive-hcatalog-core/${system:hive.version}/hive-hcatalog-core-${system:hive.version}.jar;
+
+drop table if exists json_serde1_1;
+drop table if exists json_serde1_2;
+
+create table json_serde1_1 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe';
+
+insert into table json_serde1_1
+ select array('aaa'),map('aaa',1) from src limit 2;
+
+select * from json_serde1_1;
+
+create table json_serde1_2 (
+ a array<int>,
+ b map<int,date>,
+ c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>>
+) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe';
+
+insert into table json_serde1_2
+ select
+ array(3, 2, 1),
+ map(1, date '2001-01-01', 2, null),
+ named_struct(
+ 'c1', 123456,
+ 'c2', 'hello',
+ 'c3', array('aa', 'bb', 'cc'),
+ 'c4', map('abc', 123, 'xyz', 456),
+ 'c5', named_struct('c5_1', 'bye', 'c5_2', 88))
+ from src limit 2;
+
+select * from json_serde1_2;
+
+drop table json_serde1_1;
+drop table json_serde1_2;
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/queries/clientpositive/udf_json_read.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_json_read.q b/ql/src/test/queries/clientpositive/udf_json_read.q
new file mode 100644
index 0000000..30c297b
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_json_read.q
@@ -0,0 +1,44 @@
+DESCRIBE FUNCTION java_read;
+DESCRIBE FUNCTION EXTENDED java_read;
+
+
+select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>');
+
+create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>);
+
+insert into t
+ select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>');
+
+
+
+select json_read('[
+{
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>');
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/json_serde2.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/json_serde2.q.out b/ql/src/test/results/clientpositive/json_serde2.q.out
new file mode 100644
index 0000000..cfdb051
--- /dev/null
+++ b/ql/src/test/results/clientpositive/json_serde2.q.out
@@ -0,0 +1,113 @@
+PREHOOK: query: drop table if exists json_serde1_1
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists json_serde1_1
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table if exists json_serde1_2
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists json_serde1_2
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@json_serde1_1
+POSTHOOK: query: create table json_serde1_1 (a array<string>,b map<string,int>)
+ row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@json_serde1_1
+PREHOOK: query: insert into table json_serde1_1
+ select array('aaa'),map('aaa',1) from src limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@json_serde1_1
+POSTHOOK: query: insert into table json_serde1_1
+ select array('aaa'),map('aaa',1) from src limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@json_serde1_1
+POSTHOOK: Lineage: json_serde1_1.a EXPRESSION []
+POSTHOOK: Lineage: json_serde1_1.b EXPRESSION []
+PREHOOK: query: select * from json_serde1_1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@json_serde1_1
+#### A masked pattern was here ####
+POSTHOOK: query: select * from json_serde1_1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@json_serde1_1
+#### A masked pattern was here ####
+["aaa"] {"aaa":1}
+["aaa"] {"aaa":1}
+PREHOOK: query: create table json_serde1_2 (
+ a array<int>,
+ b map<int,date>,
+ c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>>
+) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@json_serde1_2
+POSTHOOK: query: create table json_serde1_2 (
+ a array<int>,
+ b map<int,date>,
+ c struct<c1:int, c2:string, c3:array<string>, c4:map<string, int>, c5:struct<c5_1:string, c5_2:int>>
+) row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@json_serde1_2
+PREHOOK: query: insert into table json_serde1_2
+ select
+ array(3, 2, 1),
+ map(1, date '2001-01-01', 2, null),
+ named_struct(
+ 'c1', 123456,
+ 'c2', 'hello',
+ 'c3', array('aa', 'bb', 'cc'),
+ 'c4', map('abc', 123, 'xyz', 456),
+ 'c5', named_struct('c5_1', 'bye', 'c5_2', 88))
+ from src limit 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@json_serde1_2
+POSTHOOK: query: insert into table json_serde1_2
+ select
+ array(3, 2, 1),
+ map(1, date '2001-01-01', 2, null),
+ named_struct(
+ 'c1', 123456,
+ 'c2', 'hello',
+ 'c3', array('aa', 'bb', 'cc'),
+ 'c4', map('abc', 123, 'xyz', 456),
+ 'c5', named_struct('c5_1', 'bye', 'c5_2', 88))
+ from src limit 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@json_serde1_2
+POSTHOOK: Lineage: json_serde1_2.a EXPRESSION []
+POSTHOOK: Lineage: json_serde1_2.b EXPRESSION []
+POSTHOOK: Lineage: json_serde1_2.c EXPRESSION []
+PREHOOK: query: select * from json_serde1_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@json_serde1_2
+#### A masked pattern was here ####
+POSTHOOK: query: select * from json_serde1_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@json_serde1_2
+#### A masked pattern was here ####
+[3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}}
+[3,2,1] {1:"2001-01-01",2:null} {"c1":123456,"c2":"hello","c3":["aa","bb","cc"],"c4":{"abc":123,"xyz":456},"c5":{"c5_1":"bye","c5_2":88}}
+PREHOOK: query: drop table json_serde1_1
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@json_serde1_1
+PREHOOK: Output: default@json_serde1_1
+POSTHOOK: query: drop table json_serde1_1
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@json_serde1_1
+POSTHOOK: Output: default@json_serde1_1
+PREHOOK: query: drop table json_serde1_2
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@json_serde1_2
+PREHOOK: Output: default@json_serde1_2
+POSTHOOK: query: drop table json_serde1_2
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@json_serde1_2
+POSTHOOK: Output: default@json_serde1_2
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/show_functions.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out
index e91cffd..629781a 100644
--- a/ql/src/test/results/clientpositive/show_functions.q.out
+++ b/ql/src/test/results/clientpositive/show_functions.q.out
@@ -130,6 +130,7 @@ isnottrue
isnull
istrue
java_method
+json_read
json_tuple
lag
last_day
http://git-wip-us.apache.org/repos/asf/hive/blob/1105ef39/ql/src/test/results/clientpositive/udf_json_read.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_json_read.q.out b/ql/src/test/results/clientpositive/udf_json_read.q.out
new file mode 100644
index 0000000..05b1eb8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_json_read.q.out
@@ -0,0 +1,107 @@
+PREHOOK: query: DESCRIBE FUNCTION java_read
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION java_read
+POSTHOOK: type: DESCFUNCTION
+Function 'java_read' does not exist.
+PREHOOK: query: DESCRIBE FUNCTION EXTENDED java_read
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: DESCRIBE FUNCTION EXTENDED java_read
+POSTHOOK: type: DESCFUNCTION
+Function 'java_read' does not exist.
+PREHOOK: query: select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+[{"name":"john","age":null,"alias":"j","address":{"city":"LA","street":null}},{"name":"kinga","age":2,"alias":"binga","address":null}]
+PREHOOK: query: create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: create table t (info array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: insert into t
+ select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t
+POSTHOOK: query: insert into t
+ select json_read('[{"name":"john","alias":"j","address":{"city":"LA"}},{"name":"kinga","alias":"binga","age":2}]',
+ 'array<struct<name:string,age:int,alias:string,address:struct<city:string,street:string>>>')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t
+POSTHOOK: Lineage: t.info EXPRESSION []
+PREHOOK: query: select json_read('[
+{
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select json_read('[
+{
+ "business_id": "vcNAWiLM4dR7D2nwwJ7nCA",
+ "hours": {
+ "Tuesday": {
+ "close": "17:00",
+ "open": "08:00"
+ },
+ "Friday": {
+ "close": "17:00",
+ "open": "08:00"
+ }
+ },
+ "open": true,
+ "categories": [
+ "Doctors",
+ "Health & Medical"
+ ],
+ "review_count": 9,
+ "name": "Eric Goldberg, MD",
+ "neighborhoods": [],
+ "attributes": {
+ "By Appointment Only": true,
+ "Accepts Credit Cards": true,
+ "Good For Groups": 1
+ },
+ "type": "business"
+}
+]','array<struct<attributes:struct<accepts credit cards:boolean,by appointment only:boolean,good for groups:int>,business_id:string,categories:array<string>,hours:map<string,struct<close:string,open:string>>,name:string,neighborhoods:array<string>,open:boolean,review_count:int,type:string>>')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+[{"attributes":{"accepts credit cards":true,"by appointment only":true,"good for groups":1},"business_id":"vcNAWiLM4dR7D2nwwJ7nCA","categories":["Doctors","Health & Medical"],"hours":{"Tuesday":{"close":"17:00","open":"08:00"},"Friday":{"close":"17:00","open":"08:00"}},"name":"Eric Goldberg, MD","neighborhoods":[],"open":true,"review_count":9,"type":"business"}]