You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/02/27 22:18:51 UTC
orc git commit: ORC-150. Add tools for finding schema from JSON
documents and converting JSON into ORC files.
Repository: orc
Updated Branches:
refs/heads/master 9291a06e0 -> 9805e139b
ORC-150. Add tools for finding schema from JSON documents and converting JSON
into ORC files.
Fixes #95
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9805e139
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9805e139
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9805e139
Branch: refs/heads/master
Commit: 9805e139bf2041ddfd63b67ed85370bc3f4b09da
Parents: 9291a06
Author: Owen O'Malley <om...@apache.org>
Authored: Mon Feb 20 20:37:18 2017 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Feb 27 14:17:25 2017 -0800
----------------------------------------------------------------------
.../src/java/org/apache/orc/RecordReader.java | 2 +-
.../orc/mapred/OrcMapredRecordReader.java | 8 +
.../orc/mapreduce/OrcMapreduceRecordReader.java | 9 +
java/pom.xml | 2 +-
.../src/java/org/apache/orc/tools/Driver.java | 8 +
.../apache/orc/tools/convert/ConvertTool.java | 95 +++++
.../apache/orc/tools/convert/JsonReader.java | 306 ++++++++++++++++
.../org/apache/orc/tools/json/BooleanType.java | 50 +++
.../org/apache/orc/tools/json/HiveType.java | 95 +++++
.../apache/orc/tools/json/JsonSchemaFinder.java | 300 ++++++++++++++++
.../org/apache/orc/tools/json/JsonShredder.java | 116 +++++++
.../org/apache/orc/tools/json/ListType.java | 87 +++++
.../org/apache/orc/tools/json/NullType.java | 50 +++
.../org/apache/orc/tools/json/NumericType.java | 114 ++++++
.../org/apache/orc/tools/json/StringType.java | 76 ++++
.../org/apache/orc/tools/json/StructType.java | 114 ++++++
.../org/apache/orc/tools/json/UnionType.java | 122 +++++++
.../orc/tools/json/TestJsonSchemaFinder.java | 346 +++++++++++++++++++
18 files changed, 1898 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/core/src/java/org/apache/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/RecordReader.java b/java/core/src/java/org/apache/orc/RecordReader.java
index 09ba0f0..2621a8e 100644
--- a/java/core/src/java/org/apache/orc/RecordReader.java
+++ b/java/core/src/java/org/apache/orc/RecordReader.java
@@ -24,7 +24,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
/**
* A row-by-row iterator for ORC files.
*/
-public interface RecordReader {
+public interface RecordReader extends AutoCloseable {
/**
* Read the next row batch. The size of the batch to read cannot be
* controlled by the callers. Caller need to look at
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java b/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
index ddbc396..4c3c0d3 100644
--- a/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
+++ b/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
@@ -60,6 +60,14 @@ public class OrcMapredRecordReader<V extends WritableComparable>
private final VectorizedRowBatch batch;
private int rowInBatch;
+ public OrcMapredRecordReader(RecordReader reader,
+ TypeDescription schema) throws IOException {
+ this.batchReader = reader;
+ this.batch = schema.createRowBatch();
+ this.schema = schema;
+ rowInBatch = 0;
+ }
+
protected OrcMapredRecordReader(Reader fileReader,
Reader.Options options) throws IOException {
this.batchReader = fileReader.rows(options);
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java b/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
index f686e05..21a5e01 100644
--- a/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
+++ b/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
@@ -45,6 +45,15 @@ public class OrcMapreduceRecordReader<V extends WritableComparable>
private int rowInBatch;
private final V row;
+ public OrcMapreduceRecordReader(RecordReader reader,
+ TypeDescription schema) throws IOException {
+ this.batchReader = reader;
+ this.batch = schema.createRowBatch();
+ this.schema = schema;
+ rowInBatch = 0;
+ this.row = (V) OrcStruct.createValue(schema);
+ }
+
public OrcMapreduceRecordReader(Reader fileReader,
Reader.Options options) throws IOException {
this.batchReader = fileReader.rows(options);
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/pom.xml
----------------------------------------------------------------------
diff --git a/java/pom.xml b/java/pom.xml
index 2c9485f..5c9ab04 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -272,7 +272,7 @@
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
- <version>1.2</version>
+ <version>1.3.1</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/Driver.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 37bedbe..9bba013 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -24,6 +24,8 @@ import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.tools.convert.ConvertTool;
+import org.apache.orc.tools.json.JsonSchemaFinder;
import java.util.Map;
import java.util.Properties;
@@ -87,6 +89,8 @@ public class Driver {
System.err.println(" meta - print the metadata about the ORC file");
System.err.println(" data - print the data from the ORC file");
System.err.println(" scan - scan the ORC file");
+ System.err.println(" convert - convert JSON files to ORC");
+ System.err.println(" json-schema - scan JSON files to determine their schema");
System.err.println();
System.err.println("To get more help, provide -h to the command");
System.exit(1);
@@ -102,6 +106,10 @@ public class Driver {
PrintData.main(conf, options.commandArgs);
} else if ("scan".equals(options.command)) {
ScanData.main(conf, options.commandArgs);
+ } else if ("json-schema".equals(options.command)) {
+ JsonSchemaFinder.main(conf, options.commandArgs);
+ } else if ("convert".equals(options.command)) {
+ ConvertTool.main(conf, options.commandArgs);
} else {
System.err.println("Unknown subcommand: " + options.command);
System.exit(1);
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
new file mode 100644
index 0000000..81fc2ec
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.convert;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.json.JsonSchemaFinder;
+
+import java.io.IOException;
+
+/**
+ * A conversion tool to convert JSON files into ORC files.
+ */
+public class ConvertTool {
+
+ static TypeDescription computeSchema(String[] filename) throws IOException {
+ JsonSchemaFinder schemaFinder = new JsonSchemaFinder();
+ for(String file: filename) {
+ System.err.println("Scanning " + file + " for schema");
+ schemaFinder.addFile(file);
+ }
+ return schemaFinder.getSchema();
+ }
+
+ public static void main(Configuration conf,
+ String[] args) throws IOException, ParseException {
+ CommandLine opts = parseOptions(args);
+ TypeDescription schema;
+ if (opts.hasOption('s')) {
+ schema = TypeDescription.fromString(opts.getOptionValue('s'));
+ } else {
+ schema = computeSchema(opts.getArgs());
+ }
+ String outFilename = opts.hasOption('o')
+ ? opts.getOptionValue('o') : "output.orc";
+ Writer writer = OrcFile.createWriter(new Path(outFilename),
+ OrcFile.writerOptions(conf).setSchema(schema));
+ VectorizedRowBatch batch = schema.createRowBatch();
+ for (String file: opts.getArgs()) {
+ System.err.println("Processing " + file);
+ RecordReader reader = new JsonReader(new Path(file), schema, conf);
+ while (reader.nextBatch(batch)) {
+ writer.addRowBatch(batch);
+ }
+ reader.close();
+ }
+ writer.close();
+ }
+
+ static CommandLine parseOptions(String[] args) throws ParseException {
+ Options options = new Options();
+
+ options.addOption(
+ Option.builder("h").longOpt("help").desc("Provide help").build());
+ options.addOption(
+ Option.builder("s").longOpt("schema").hasArg()
+ .desc("The schema to write in to the file").build());
+ options.addOption(
+ Option.builder("o").longOpt("output").desc("Output filename")
+ .hasArg().build());
+ CommandLine cli = new GnuParser().parse(options, args);
+ if (cli.hasOption('h')) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("convert", options);
+ System.exit(1);
+ }
+ return cli;
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
new file mode 100644
index 0000000..2cc5711
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.convert;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonStreamParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+
+public class JsonReader implements RecordReader {
+ private final TypeDescription schema;
+ private final JsonStreamParser parser;
+ private final JsonConverter[] converters;
+ private final long totalSize;
+ private final FSDataInputStream rawStream;
+ private long rowNumber = 0;
+
+ interface JsonConverter {
+ void convert(JsonElement value, ColumnVector vect, int row);
+ }
+
+ static class BooleanColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ LongColumnVector vector = (LongColumnVector) vect;
+ vector.vector[row] = value.getAsBoolean() ? 1 : 0;
+ }
+ }
+ }
+
+ static class LongColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ LongColumnVector vector = (LongColumnVector) vect;
+ vector.vector[row] = value.getAsLong();
+ }
+ }
+ }
+
+ static class DoubleColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DoubleColumnVector vector = (DoubleColumnVector) vect;
+ vector.vector[row] = value.getAsDouble();
+ }
+ }
+ }
+
+ static class StringColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ BytesColumnVector vector = (BytesColumnVector) vect;
+ byte[] bytes = value.getAsString().getBytes(StandardCharsets.UTF_8);
+ vector.setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ static class BinaryColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ BytesColumnVector vector = (BytesColumnVector) vect;
+ String binStr = value.getAsString();
+ byte[] bytes = new byte[binStr.length()/2];
+ for(int i=0; i < bytes.length; ++i) {
+ bytes[i] = (byte) Integer.parseInt(binStr.substring(i*2, i*2+2), 16);
+ }
+ vector.setRef(row, bytes, 0, bytes.length);
+ }
+ }
+ }
+
+ static class TimestampColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ TimestampColumnVector vector = (TimestampColumnVector) vect;
+ vector.set(row, Timestamp.valueOf(value.getAsString()
+ .replaceAll("[TZ]", " ")));
+ }
+ }
+ }
+
+ static class DecimalColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DecimalColumnVector vector = (DecimalColumnVector) vect;
+ vector.vector[row].set(HiveDecimal.create(value.getAsString()));
+ }
+ }
+ }
+
+ static class StructColumnConverter implements JsonConverter {
+ private JsonConverter[] childrenConverters;
+ private List<String> fieldNames;
+
+ public StructColumnConverter(TypeDescription schema) {
+ List<TypeDescription> kids = schema.getChildren();
+ childrenConverters = new JsonConverter[kids.size()];
+ for(int c=0; c < childrenConverters.length; ++c) {
+ childrenConverters[c] = createConverter(kids.get(c));
+ }
+ fieldNames = schema.getFieldNames();
+ }
+
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ StructColumnVector vector = (StructColumnVector) vect;
+ JsonObject obj = value.getAsJsonObject();
+ for(int c=0; c < childrenConverters.length; ++c) {
+ JsonElement elem = obj.get(fieldNames.get(c));
+ childrenConverters[c].convert(elem, vector.fields[c], row);
+ }
+ }
+ }
+ }
+
+ static class ListColumnConverter implements JsonConverter {
+ private JsonConverter childrenConverter;
+
+ public ListColumnConverter(TypeDescription schema) {
+ childrenConverter = createConverter(schema.getChildren().get(0));
+ }
+
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ ListColumnVector vector = (ListColumnVector) vect;
+ JsonArray obj = value.getAsJsonArray();
+ vector.lengths[row] = obj.size();
+ vector.offsets[row] = vector.childCount;
+ vector.childCount += vector.lengths[row];
+ vector.child.ensureSize(vector.childCount, true);
+ for(int c=0; c < obj.size(); ++c) {
+ childrenConverter.convert(obj.get(c), vector.child,
+ (int) vector.offsets[row] + c);
+ }
+ }
+ }
+ }
+
+ static JsonConverter createConverter(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ return new LongColumnConverter();
+ case FLOAT:
+ case DOUBLE:
+ return new DoubleColumnConverter();
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ return new StringColumnConverter();
+ case DECIMAL:
+ return new DecimalColumnConverter();
+ case TIMESTAMP:
+ return new TimestampColumnConverter();
+ case BINARY:
+ return new BinaryColumnConverter();
+ case BOOLEAN:
+ return new BooleanColumnConverter();
+ case STRUCT:
+ return new StructColumnConverter(schema);
+ case LIST:
+ return new ListColumnConverter(schema);
+ default:
+ throw new IllegalArgumentException("Unhandled type " + schema);
+ }
+ }
+
+ public JsonReader(Path path,
+ TypeDescription schema,
+ Configuration conf) throws IOException {
+ this.schema = schema;
+ FileSystem fs = path.getFileSystem(conf);
+ totalSize = fs.getFileStatus(path).getLen();
+ rawStream = fs.open(path);
+ String name = path.getName();
+ int lastDot = name.lastIndexOf(".");
+ InputStream input = rawStream;
+ if (lastDot >= 0) {
+ if (".gz".equals(name.substring(lastDot))) {
+ input = new GZIPInputStream(rawStream);
+ }
+ }
+ parser = new JsonStreamParser(new InputStreamReader(input,
+ StandardCharsets.UTF_8));
+ if (schema.getCategory() != TypeDescription.Category.STRUCT) {
+ throw new IllegalArgumentException("Root must be struct - " + schema);
+ }
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ converters = new JsonConverter[fieldTypes.size()];
+ for(int c = 0; c < converters.length; ++c) {
+ converters[c] = createConverter(fieldTypes.get(c));
+ }
+ }
+
+ public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+ batch.reset();
+ int maxSize = batch.getMaxSize();
+ List<String> fieldNames = schema.getFieldNames();
+ while (parser.hasNext() && batch.size < maxSize) {
+ JsonObject elem = parser.next().getAsJsonObject();
+ for(int c=0; c < converters.length; ++c) {
+ // look up each field to see if it is in the input, otherwise
+ // set it to null.
+ JsonElement field = elem.get(fieldNames.get(c));
+ if (field == null) {
+ batch.cols[c].noNulls = false;
+ batch.cols[c].isNull[batch.size] = true;
+ } else {
+ converters[c].convert(field, batch.cols[c], batch.size);
+ }
+ }
+ batch.size++;
+ }
+ rowNumber += batch.size;
+ return batch.size != 0;
+ }
+
+ @Override
+ public long getRowNumber() throws IOException {
+ return rowNumber;
+ }
+
+ @Override
+ public float getProgress() throws IOException {
+ long pos = rawStream.getPos();
+ return totalSize != 0 && pos < totalSize ? (float) pos / totalSize : 1;
+ }
+
+ public void close() throws IOException {
+ rawStream.close();
+ }
+
+ @Override
+ public void seekToRow(long rowCount) throws IOException {
+ throw new UnsupportedOperationException("Seek is not supported by JsonReader");
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java b/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
new file mode 100644
index 0000000..916fe30
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * A type that represents true, false, and null.
+ */
+class BooleanType extends HiveType {
+ BooleanType() {
+ super(Kind.BOOLEAN);
+ }
+
+ @Override
+ public String toString() {
+ return "boolean";
+ }
+
+ @Override
+ public boolean subsumes(HiveType other) {
+ return other.kind == Kind.BOOLEAN || other.kind == Kind.NULL;
+ }
+
+ @Override
+ public void merge(HiveType other) {
+ // nothing to do to merge boolean types
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ return TypeDescription.createBoolean();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/HiveType.java b/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
new file mode 100644
index 0000000..6222aca
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+
+/**
+ * The internal representation of what we have discovered about a given
+ * field's type.
+ */
+abstract class HiveType {
+ enum Kind {
+ NULL(0),
+ BOOLEAN(1),
+ BYTE(1), SHORT(2), INT(3), LONG(4), DECIMAL(5), FLOAT(6), DOUBLE(7),
+ BINARY(1), DATE(1), TIMESTAMP(1), STRING(2),
+ STRUCT(1, false),
+ LIST(1, false),
+ UNION(8, false);
+
+ // for types that subsume each other, establish a ranking.
+ final int rank;
+ final boolean isPrimitive;
+ Kind(int rank, boolean isPrimitive) {
+ this.rank = rank;
+ this.isPrimitive = isPrimitive;
+ }
+ Kind(int rank) {
+ this(rank, true);
+ }
+ }
+
+ protected Kind kind;
+
+ HiveType(Kind kind) {
+ this.kind = kind;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || other.getClass() != getClass()) {
+ return false;
+ }
+ return ((HiveType) other).kind.equals(kind);
+ }
+
+ @Override
+ public int hashCode() {
+ return kind.hashCode();
+ }
+
+ /**
+ * Does this type include all of the values of the other type?
+ * @param other the other type to compare against
+ * @return true, if this type includes all of the values of the other type
+ */
+ public abstract boolean subsumes(HiveType other);
+
+ /**
+ * Merge the other type into this one. It assumes that subsumes(other) is
+ * true.
+ * @param other
+ */
+ public abstract void merge(HiveType other);
+
+ /**
+ * Print this type into the stream using a flat structure given the
+ * prefix on each element.
+ * @param out the stream to print to
+ * @param prefix the prefix to add to each field name
+ */
+ public void printFlat(PrintStream out, String prefix) {
+ out.println(prefix + ": " + toString());
+ }
+
+ public abstract TypeDescription getSchema();
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
new file mode 100644
index 0000000..40841fc
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
@@ -0,0 +1,300 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonStreamParser;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.TypeDescription;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This class determines the equivalent Hive schema for a group of JSON
+ * documents.
+ */
+public class JsonSchemaFinder {
+ private static final Logger LOG = LoggerFactory.getLogger(JsonSchemaFinder.class);
+
+ private static final Pattern HEX_PATTERN =
+ Pattern.compile("^([0-9a-fA-F][0-9a-fA-F])+$");
+ private static final Pattern TIMESTAMP_PATTERN =
+ Pattern.compile("^[\"]?([0-9]{4}[-/][0-9]{2}[-/][0-9]{2})[T ]" +
+ "([0-9]{2}:[0-9]{2}:[0-9]{2})" +
+ "(([ ][-+]?[0-9]{2}([:][0-9]{2})?)|Z)?[\"]?$");
+ private static final Pattern DECIMAL_PATTERN =
+ Pattern.compile("^-?(?<int>[0-9]+)([.](?<fraction>[0-9]+))?$");
+ private static final int INDENT = 2;
+ private static final int MAX_DECIMAL_DIGITS = 38;
+
+ static final BigInteger MIN_LONG = new BigInteger("-9223372036854775808");
+ static final BigInteger MAX_LONG = new BigInteger("9223372036854775807");
+
+ private HiveType mergedType = null;
+ private long records = 0;
+
+ /**
+  * Pick the type model that describes a single JSON value.
+  * <ul>
+  *   <li>booleans become a BooleanType</li>
+  *   <li>numbers become a NumericType (see pickNumericType)</li>
+  *   <li>strings become a StringType (see pickStringType)</li>
+  *   <li>null becomes a NullType</li>
+  *   <li>arrays become a ListType whose element type is the merge of the
+  *       types of all of the elements (NullType for an empty array)</li>
+  *   <li>objects become a StructType with one field per member</li>
+  * </ul>
+  * @param json the JSON value to examine
+  * @return a new type model for the value
+  */
+ static HiveType pickType(JsonElement json) {
+   if (json.isJsonPrimitive()) {
+     JsonPrimitive prim = (JsonPrimitive) json;
+     if (prim.isBoolean()) {
+       return new BooleanType();
+     } else if (prim.isNumber()) {
+       return pickNumericType(prim);
+     } else {
+       return pickStringType(prim.getAsString());
+     }
+   } else if (json.isJsonNull()) {
+     return new NullType();
+   } else if (json.isJsonArray()) {
+     ListType result = new ListType();
+     // start from the null type, which every other type subsumes
+     result.elementType = new NullType();
+     for (JsonElement child : (JsonArray) json) {
+       result.elementType = mergeType(result.elementType, pickType(child));
+     }
+     return result;
+   } else {
+     JsonObject obj = (JsonObject) json;
+     StructType result = new StructType();
+     for (Map.Entry<String, JsonElement> field : obj.entrySet()) {
+       result.fields.put(field.getKey(), pickType(field.getValue()));
+     }
+     return result;
+   }
+ }
+
+ /**
+  * Pick the numeric type for a JSON number. Plain decimal literals become
+  * the narrowest integer type that holds the value or, failing that, a
+  * decimal of up to MAX_DECIMAL_DIGITS digits. Everything else (for example
+  * exponent notation or literals with too many digits) becomes a float if
+  * its magnitude is within the float range and a double otherwise.
+  * @param prim the JSON number
+  * @return the numeric type model for the number
+  */
+ private static HiveType pickNumericType(JsonPrimitive prim) {
+   Matcher matcher = DECIMAL_PATTERN.matcher(prim.getAsString());
+   if (matcher.matches()) {
+     int intDigits = matcher.group("int").length();
+     String fraction = matcher.group("fraction");
+     int scale = fraction == null ? 0 : fraction.length();
+     if (scale == 0) {
+       if (intDigits < 19) {
+         // fewer than 19 digits always fits in a long
+         long value = prim.getAsLong();
+         if (value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE) {
+           return new NumericType(HiveType.Kind.BYTE, intDigits, scale);
+         } else if (value >= Short.MIN_VALUE && value <= Short.MAX_VALUE) {
+           return new NumericType(HiveType.Kind.SHORT, intDigits, scale);
+         } else if (value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE) {
+           return new NumericType(HiveType.Kind.INT, intDigits, scale);
+         } else {
+           return new NumericType(HiveType.Kind.LONG, intDigits, scale);
+         }
+       } else if (intDigits == 19) {
+         // at 19 digits, it may fit inside a long, but we need to check
+         BigInteger val = prim.getAsBigInteger();
+         if (val.compareTo(MIN_LONG) >= 0 && val.compareTo(MAX_LONG) <= 0) {
+           return new NumericType(HiveType.Kind.LONG, intDigits, scale);
+         }
+       }
+     }
+     if (intDigits + scale <= MAX_DECIMAL_DIGITS) {
+       return new NumericType(HiveType.Kind.DECIMAL, intDigits, scale);
+     }
+   }
+   // Float.MIN_VALUE is the smallest positive float, so the range check has
+   // to be made on the magnitude; otherwise every negative value (and zero)
+   // would be pushed to double regardless of its size.
+   double magnitude = Math.abs(prim.getAsDouble());
+   if (magnitude == 0 ||
+       (magnitude >= Float.MIN_VALUE && magnitude <= Float.MAX_VALUE)) {
+     return new NumericType(HiveType.Kind.FLOAT, 0, 0);
+   } else {
+     return new NumericType(HiveType.Kind.DOUBLE, 0, 0);
+   }
+ }
+
+ /**
+  * Pick the type for a JSON string: timestamp if it matches the timestamp
+  * pattern, binary if it consists of pairs of hex digits, and string
+  * otherwise.
+  * @param str the value of the JSON string
+  * @return the string type model for the value
+  */
+ private static HiveType pickStringType(String str) {
+   if (TIMESTAMP_PATTERN.matcher(str).matches()) {
+     return new StringType(HiveType.Kind.TIMESTAMP);
+   } else if (HEX_PATTERN.matcher(str).matches()) {
+     return new StringType(HiveType.Kind.BINARY);
+   } else {
+     return new StringType(HiveType.Kind.STRING);
+   }
+ }
+
+ /**
+  * Combine two type models into one that covers both of them. If either
+  * argument is null the other one is returned unchanged. Otherwise the type
+  * that subsumes the other absorbs it, and if neither subsumes the other a
+  * new union of the two is created.
+  * @param previous the type accumulated so far, which may be null
+  * @param type the new type to fold in, which may be null
+  * @return the type that describes both inputs
+  */
+ static HiveType mergeType(HiveType previous, HiveType type) {
+   if (previous == null) {
+     return type;
+   }
+   if (type == null) {
+     return previous;
+   }
+   if (previous.subsumes(type)) {
+     previous.merge(type);
+     return previous;
+   }
+   if (type.subsumes(previous)) {
+     type.merge(previous);
+     return type;
+   }
+   return new UnionType(previous, type);
+ }
+
+ static void printType(PrintStream out, HiveType type, int margin) {
+ if (type == null) {
+ out.print("void");
+ } else if (type.kind.isPrimitive) {
+ out.print(type.toString());
+ } else {
+ switch (type.kind) {
+ case STRUCT:
+ out.println("struct <");
+ boolean first = true;
+ for(Map.Entry<String, HiveType> field:
+ ((StructType) type).fields.entrySet()) {
+ if (!first) {
+ out.println(",");
+ } else {
+ first = false;
+ }
+ for(int i=0; i < margin; i++) {
+ out.print(' ');
+ }
+ out.print(field.getKey());
+ out.print(": ");
+ printType(out, field.getValue(), margin + INDENT);
+ }
+ out.print(">");
+ break;
+ case LIST:
+ out.print("array <");
+ printType(out, ((ListType) type).elementType, margin + INDENT);
+ out.print(">");
+ break;
+ case UNION:
+ out.print("uniontype <");
+ first = true;
+ for(HiveType child: ((UnionType) type).children) {
+ if (!first) {
+ out.print(',');
+ } else {
+ first = false;
+ }
+ printType(out, child, margin + INDENT);
+ }
+ out.print(">");
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown kind " + type.kind);
+ }
+ }
+ }
+
+ /**
+  * Print a struct type as a "create table" statement for a table named
+  * tbl, with one column per struct field.
+  * @param out the stream to print to
+  * @param type the struct whose fields become the columns
+  */
+ static void printAsTable(PrintStream out, StructType type) {
+   StringBuilder columnIndent = new StringBuilder();
+   for (int i = 0; i < INDENT; ++i) {
+     columnIndent.append(' ');
+   }
+   out.println("create table tbl (");
+   boolean needsSeparator = false;
+   for (Map.Entry<String, HiveType> column : type.fields.entrySet()) {
+     if (needsSeparator) {
+       out.println(",");
+     }
+     needsSeparator = true;
+     out.print(columnIndent.toString());
+     out.print(column.getKey());
+     out.print(" ");
+     printType(out, column.getValue(), 2 * INDENT);
+   }
+   out.println();
+   out.println(")");
+ }
+
+ public void addFile(String filename) throws IOException {
+ java.io.Reader reader;
+ FileInputStream inputStream = new FileInputStream(filename);
+ if (filename.endsWith(".gz")) {
+ reader = new InputStreamReader(new GZIPInputStream(inputStream),
+ StandardCharsets.UTF_8);
+ } else {
+ reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
+ }
+ JsonStreamParser parser = new JsonStreamParser(reader);
+ while (parser.hasNext()) {
+ records += 1;
+ mergedType = mergeType(mergedType, pickType(parser.next()));
+ }
+ }
+
+ /**
+ * Get the ORC schema for the documents that have been added so far.
+ * NOTE(review): mergedType starts out as null, so this throws a
+ * NullPointerException if no document has been read yet.
+ * @return the schema of the merged type
+ */
+ public TypeDescription getSchema() {
+ return mergedType.getSchema();
+ }
+
+ public static void main(Configuration conf,
+ String[] args) throws Exception {
+ JsonSchemaFinder result = new JsonSchemaFinder();
+ CommandLine cli = parseArguments(args);
+ for (String filename: cli.getArgs()) {
+ System.err.println("Reading file " + filename);
+ result.addFile(filename);
+ }
+ System.err.println(result.records + " records read");
+ System.err.println();
+ if (cli.hasOption('f')) {
+ result.mergedType.printFlat(System.out, "root");
+ } else if (cli.hasOption('t')) {
+ printAsTable(System.out, (StructType) result.mergedType);
+ } else {
+ System.out.println(result.getSchema());
+ }
+ }
+
+ /**
+  * Parse the command line. If the help option is present, the usage is
+  * printed and the JVM exits with status 1.
+  * @param args the raw command line arguments
+  * @return the parsed command line
+  * @throws ParseException if the arguments are not valid
+  */
+ static CommandLine parseArguments(String[] args) throws ParseException {
+   Option help = Option.builder("h").longOpt("help")
+       .desc("Provide help").build();
+   Option flat = Option.builder("f").longOpt("flat")
+       .desc("Print types as flat list of types").build();
+   Option table = Option.builder("t").longOpt("table")
+       .desc("Print types as Hive table declaration").build();
+   Options options = new Options()
+       .addOption(help)
+       .addOption(flat)
+       .addOption(table);
+
+   CommandLine cli = new GnuParser().parse(options, args);
+   if (!cli.hasOption('h')) {
+     return cli;
+   }
+   new HelpFormatter().printHelp("json-schema", options);
+   System.exit(1);
+   return cli;
+ }
+
+ /**
+ * Command line entry point; runs the tool with a new default Configuration.
+ * @param args the command line options followed by the file names
+ * @throws Exception if the arguments can't be parsed or a file can't be read
+ */
+ public static void main(String[] args) throws Exception {
+ main(new Configuration(), args);
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
new file mode 100644
index 0000000..2f626a5
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonStreamParser;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This class takes a set of JSON documents and shreds them into one file
+ * per primitive column. This is useful when trying to understand a set of
+ * documents by providing sample values for each of the columns.
+ *
+ * For example, a document that looks like:
+ * {'a': 'aaaa', 'b': { 'c': 12, 'd': true}, e: 'eeee'}
+ *
+ * Will produce 4 files with the given contents:
+ * root.a: aaaa
+ * root.b.c: 12
+ * root.b.d: true
+ * root.e: eeee
+ */
+public class JsonShredder {
+
+ private final Map<String, PrintStream> files =
+ new HashMap<String, PrintStream>();
+
+ private PrintStream getFile(String name) throws IOException {
+ PrintStream result = files.get(name);
+ if (result == null) {
+ result = new PrintStream(new FileOutputStream(name + ".txt"), false,
+ StandardCharsets.UTF_8.name());
+ files.put(name, result);
+ }
+ return result;
+ }
+
+ private void shredObject(String name, JsonElement json) throws IOException {
+ if (json.isJsonPrimitive()) {
+ JsonPrimitive primitive = (JsonPrimitive) json;
+ getFile(name).println(primitive.getAsString());
+ } else if (json.isJsonNull()) {
+ // just skip it
+ } else if (json.isJsonArray()) {
+ for(JsonElement child: ((JsonArray) json)) {
+ shredObject(name + ".list", child);
+ }
+ } else {
+ JsonObject obj = (JsonObject) json;
+ for(Map.Entry<String,JsonElement> field: obj.entrySet()) {
+ String fieldName = field.getKey();
+ shredObject(name + "." + fieldName, field.getValue());
+ }
+ }
+ }
+
+ private void close() throws IOException {
+ for(Map.Entry<String, PrintStream> file: files.entrySet()) {
+ file.getValue().close();
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ int count = 0;
+ JsonShredder shredder = new JsonShredder();
+ for (String filename: args) {
+ System.out.println("Reading " + filename);
+ System.out.flush();
+ java.io.Reader reader;
+ FileInputStream inStream = new FileInputStream(filename);
+ if (filename.endsWith(".gz")) {
+ reader = new InputStreamReader(new GZIPInputStream(inStream),
+ StandardCharsets.UTF_8);
+ } else {
+ reader = new InputStreamReader(inStream, StandardCharsets.UTF_8);
+ }
+ JsonStreamParser parser = new JsonStreamParser(reader);
+ while (parser.hasNext()) {
+ count += 1;
+ JsonElement item = parser.next();
+ shredder.shredObject("root", item);
+ }
+ }
+ shredder.close();
+ System.out.println(count + " records read");
+ System.out.println();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/ListType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/ListType.java b/java/tools/src/java/org/apache/orc/tools/json/ListType.java
new file mode 100644
index 0000000..7ef80fd
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/ListType.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+
+/**
+ * A model for types that are lists.
+ */
+class ListType extends HiveType {
+  // the type of the elements of the list
+  HiveType elementType;
+
+  /**
+   * Create a list type without an element type; the caller is expected to
+   * assign elementType before using the object.
+   */
+  public ListType() {
+    super(Kind.LIST);
+  }
+
+  /**
+   * Create a list type with the given element type.
+   * @param child the type of the elements
+   */
+  public ListType(HiveType child) {
+    this();
+    this.elementType = child;
+  }
+
+  @Override
+  public String toString() {
+    return "list<" + elementType.toString() + ">";
+  }
+
+  /**
+   * Lists are equal when their element types are equal.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (!super.equals(other)) {
+      return false;
+    }
+    return elementType.equals(((ListType) other).elementType);
+  }
+
+  @Override
+  public int hashCode() {
+    return 3 * super.hashCode() + elementType.hashCode();
+  }
+
+  /**
+   * A list subsumes null and any other list; differing element types are
+   * reconciled by merge.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    Kind otherKind = other.kind;
+    return otherKind == Kind.LIST || otherKind == Kind.NULL;
+  }
+
+  /**
+   * Merge another list into this one by combining the element types: the
+   * element type that subsumes the other absorbs it, and otherwise the
+   * element type becomes a union of the two. Non-list types are ignored.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (!(other instanceof ListType)) {
+      return;
+    }
+    HiveType theirs = ((ListType) other).elementType;
+    if (elementType.subsumes(theirs)) {
+      elementType.merge(theirs);
+    } else if (theirs.subsumes(elementType)) {
+      theirs.merge(elementType);
+      elementType = theirs;
+    } else {
+      elementType = new UnionType(elementType, theirs);
+    }
+  }
+
+  /**
+   * Print the element type with "._list" appended to the prefix.
+   */
+  @Override
+  public void printFlat(PrintStream out, String prefix) {
+    String elementPrefix = prefix + "._list";
+    elementType.printFlat(out, elementPrefix);
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    TypeDescription elementSchema = elementType.getSchema();
+    return TypeDescription.createList(elementSchema);
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/NullType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/NullType.java b/java/tools/src/java/org/apache/orc/tools/json/NullType.java
new file mode 100644
index 0000000..fa22a3b
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/NullType.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * The type that only includes the null value.
+ */
+class NullType extends HiveType {
+ NullType() {
+ super(Kind.NULL);
+ }
+
+ @Override
+ public String toString() {
+ return "void";
+ }
+
+ @Override
+ public boolean subsumes(HiveType other) {
+ return other.kind == Kind.NULL;
+ }
+
+ @Override
+ public void merge(HiveType other) {
+ // nothing to do to merge null types *smile*
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ return TypeDescription.createUnion();
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/NumericType.java b/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
new file mode 100644
index 0000000..172cb4c
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * A type that represents all of the numeric types: byte, short, int, long,
+ * float, double, and decimal.
+ */
+class NumericType extends HiveType {
+ // the maximum number of digits before the decimal
+ int intDigits;
+ // the maximum number of digits after the decimal
+ int scale;
+
+ NumericType(Kind kind, int intDigits, int scale) {
+ super(kind);
+ this.intDigits = intDigits;
+ this.scale = scale;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (super.equals(other)) {
+ NumericType otherNumber = (NumericType) other;
+ return intDigits == otherNumber.intDigits || scale == otherNumber.scale;
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return super.hashCode() * 41 + (intDigits * 17) + scale;
+ }
+
+ @Override
+ public String toString() {
+ switch (kind) {
+ case BYTE:
+ return "tinyint";
+ case SHORT:
+ return "smallint";
+ case INT:
+ return "int";
+ case LONG:
+ return "bigint";
+ case DECIMAL:
+ return "decimal(" + (intDigits + scale) + "," + scale + ")";
+ case FLOAT:
+ return "float";
+ case DOUBLE:
+ return "double";
+ default:
+ throw new IllegalArgumentException("Unknown kind " + kind);
+ }
+ }
+
+ @Override
+ public boolean subsumes(HiveType other) {
+ return other.getClass() == NumericType.class || other.kind == Kind.NULL;
+ }
+
+ @Override
+ public void merge(HiveType other) {
+ if (other.getClass() == NumericType.class) {
+ NumericType otherNumber = (NumericType) other;
+ this.intDigits = Math.max(this.intDigits, otherNumber.intDigits);
+ this.scale = Math.max(this.scale, otherNumber.scale);
+ if (kind.rank < other.kind.rank) {
+ kind = other.kind;
+ }
+ }
+ }
+
+ @Override
+ public TypeDescription getSchema() {
+ switch (kind) {
+ case BYTE:
+ return TypeDescription.createByte();
+ case SHORT:
+ return TypeDescription.createShort();
+ case INT:
+ return TypeDescription.createInt();
+ case LONG:
+ return TypeDescription.createLong();
+ case DECIMAL:
+ return TypeDescription.createDecimal()
+ .withPrecision(intDigits+scale).withScale(scale);
+ case FLOAT:
+ return TypeDescription.createFloat();
+ case DOUBLE:
+ return TypeDescription.createDouble();
+ default:
+ throw new IllegalArgumentException("Unknown kind " + kind);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/StringType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/StringType.java b/java/tools/src/java/org/apache/orc/tools/json/StringType.java
new file mode 100644
index 0000000..32cb73d
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/StringType.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * These are the types that correspond to the JSON string values: string,
+ * binary, timestamp, and date.
+ */
+class StringType extends HiveType {
+  /**
+   * Create a string-like type.
+   * @param kind the kind of the type
+   */
+  StringType(Kind kind) {
+    super(kind);
+  }
+
+  /**
+   * Get the Hive name of the type.
+   */
+  @Override
+  public String toString() {
+    final String name;
+    switch (kind) {
+      case BINARY:
+        name = "binary";
+        break;
+      case STRING:
+        name = "string";
+        break;
+      case TIMESTAMP:
+        name = "timestamp";
+        break;
+      case DATE:
+        name = "date";
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown kind " + kind);
+    }
+    return name;
+  }
+
+  /**
+   * A string type subsumes null and every other string type.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return Kind.NULL == other.kind || StringType.class == other.getClass();
+  }
+
+  /**
+   * Merge another string type into this one. If the kinds differ, this type
+   * falls back to a plain string. Other types are ignored.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other.getClass() != StringType.class) {
+      return;
+    }
+    // the general case is that everything is a string.
+    if (other.kind != kind) {
+      kind = Kind.STRING;
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    final TypeDescription schema;
+    switch (kind) {
+      case BINARY:
+        schema = TypeDescription.createBinary();
+        break;
+      case STRING:
+        schema = TypeDescription.createString();
+        break;
+      case TIMESTAMP:
+        schema = TypeDescription.createTimestamp();
+        break;
+      case DATE:
+        schema = TypeDescription.createDate();
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown kind " + kind);
+    }
+    return schema;
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/StructType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/StructType.java b/java/tools/src/java/org/apache/orc/tools/json/StructType.java
new file mode 100644
index 0000000..c79146a
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/StructType.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Model structs.
+ */
+class StructType extends HiveType {
+  // the fields of the struct, sorted by name
+  final Map<String, HiveType> fields = new TreeMap<String, HiveType>();
+
+  /**
+   * Create a struct without any fields.
+   */
+  StructType() {
+    super(Kind.STRUCT);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("struct<");
+    String separator = "";
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      buf.append(separator);
+      separator = ",";
+      buf.append(field.getKey());
+      buf.append(':');
+      buf.append(field.getValue().toString());
+    }
+    return buf.append(">").toString();
+  }
+
+  /**
+   * Add or replace a field.
+   * @param name the name of the field
+   * @param fieldType the type of the field
+   * @return this struct, so that calls can be chained
+   */
+  public StructType addField(String name, HiveType fieldType) {
+    fields.put(name, fieldType);
+    return this;
+  }
+
+  /**
+   * Structs are equal when they have equal fields.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (!super.equals(other)) {
+      return false;
+    }
+    return fields.equals(((StructType) other).fields);
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = super.hashCode() * 3;
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      hash += field.getKey().hashCode() * 17 + field.getValue().hashCode();
+    }
+    return hash;
+  }
+
+  /**
+   * A struct subsumes null and any other struct; differing fields are
+   * reconciled by merge.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    Kind otherKind = other.kind;
+    return otherKind == Kind.STRUCT || otherKind == Kind.NULL;
+  }
+
+  /**
+   * Merge the fields of another struct into this one. Fields that are new
+   * are added; for fields that are in both structs the type that subsumes
+   * the other absorbs it, and otherwise the field becomes a union of the
+   * two types. Non-struct types are ignored.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other.getClass() != StructType.class) {
+      return;
+    }
+    for (Map.Entry<String, HiveType> pair :
+         ((StructType) other).fields.entrySet()) {
+      String name = pair.getKey();
+      HiveType theirs = pair.getValue();
+      HiveType ours = fields.get(name);
+      if (ours == null) {
+        fields.put(name, theirs);
+      } else if (ours.subsumes(theirs)) {
+        ours.merge(theirs);
+      } else if (theirs.subsumes(ours)) {
+        theirs.merge(ours);
+        fields.put(name, theirs);
+      } else {
+        fields.put(name, new UnionType(ours, theirs));
+      }
+    }
+  }
+
+  /**
+   * Print each of the fields using "prefix.fieldName" as its prefix.
+   */
+  @Override
+  public void printFlat(PrintStream out, String prefix) {
+    String fieldPrefix = prefix + ".";
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      field.getValue().printFlat(out, fieldPrefix + field.getKey());
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    TypeDescription struct = TypeDescription.createStruct();
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      struct.addField(field.getKey(), field.getValue().getSchema());
+    }
+    return struct;
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/UnionType.java b/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
new file mode 100644
index 0000000..bd2fd89
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A union type to represent types that don't fit together.
+ */
+class UnionType extends HiveType {
+  // the alternatives of the union, in the order that they were added
+  final List<HiveType> children = new ArrayList<HiveType>();
+
+  /**
+   * Create a union without any children.
+   */
+  UnionType() {
+    super(Kind.UNION);
+  }
+
+  /**
+   * Create a union of two types.
+   * @param left the first child
+   * @param right the second child
+   */
+  UnionType(HiveType left, HiveType right) {
+    this();
+    children.add(left);
+    children.add(right);
+  }
+
+  /**
+   * Append a child to the union.
+   * @param type the new child
+   * @return this union, so that calls can be chained
+   */
+  UnionType addType(HiveType type) {
+    children.add(type);
+    return this;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("uniontype<");
+    String separator = "";
+    for (HiveType child : children) {
+      buf.append(separator);
+      separator = ",";
+      buf.append(child.toString());
+    }
+    return buf.append(">").toString();
+  }
+
+  /**
+   * Unions are equal when they have equal children in the same order.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (!super.equals(other)) {
+      return false;
+    }
+    return children.equals(((UnionType) other).children);
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = super.hashCode();
+    for (HiveType child : children) {
+      hash += child.hashCode() * 17;
+    }
+    return hash;
+  }
+
+  /**
+   * A union can absorb any other type.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return true;
+  }
+
+  /**
+   * Merge another type into this union. The children of another union are
+   * merged one at a time. Any other type is merged with the first child
+   * that subsumes it or that it subsumes, and is added as a new child if
+   * there is no such child.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other instanceof UnionType) {
+      for (HiveType otherChild : ((UnionType) other).children) {
+        merge(otherChild);
+      }
+      return;
+    }
+    for (int slot = 0; slot < children.size(); slot++) {
+      HiveType current = children.get(slot);
+      if (current.subsumes(other)) {
+        current.merge(other);
+        return;
+      }
+      if (other.subsumes(current)) {
+        other.merge(current);
+        children.set(slot, other);
+        return;
+      }
+    }
+    addType(other);
+  }
+
+  /**
+   * Print each of the children using "prefix.N" as its prefix, where N is
+   * the position of the child counting from 0.
+   */
+  @Override
+  public void printFlat(PrintStream out, String prefix) {
+    String childPrefix = prefix + ".";
+    for (int id = 0; id < children.size(); ++id) {
+      children.get(id).printFlat(out, childPrefix + id);
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    TypeDescription union = TypeDescription.createUnion();
+    for (HiveType child : children) {
+      union.addUnionChild(child.getSchema());
+    }
+    return union;
+  }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java b/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
new file mode 100644
index 0000000..fac092a
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
@@ -0,0 +1,346 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonNull;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.internal.LazilyParsedNumber;
+import org.junit.Test;
+
+import static junit.framework.Assert.assertEquals;
+
+public class TestJsonSchemaFinder {
+
+ @Test
+ public void testBinaryPatterns() throws Exception {
+ assertEquals("binary",
+ JsonSchemaFinder.pickType(new JsonPrimitive("00000000")).toString());
+ assertEquals("string",
+ JsonSchemaFinder.pickType(new JsonPrimitive("0000000")).toString());
+ assertEquals("string",
+ JsonSchemaFinder.pickType(new JsonPrimitive("")).toString());
+ assertEquals("binary",
+ JsonSchemaFinder.pickType(new JsonPrimitive("0123456789abcdefABCDEF")).toString());
+ assertEquals("string",
+ JsonSchemaFinder.pickType(new JsonPrimitive("00x0")).toString());
+ }
+
+ @Test
+ public void testTimestampPatterns() throws Exception {
+ assertEquals("timestamp",
+ JsonSchemaFinder.pickType(new JsonPrimitive("2016-01-05T12:34:56Z")).toString());
+ assertEquals("timestamp",
+ JsonSchemaFinder.pickType(new JsonPrimitive("2016/01/05 12:34:56")).toString());
+ assertEquals("string",
+ JsonSchemaFinder.pickType(new JsonPrimitive("2016/01/05")).toString());
+ assertEquals("timestamp",
+ JsonSchemaFinder.pickType(new JsonPrimitive("2016-01-01 56:00:00 +08")).toString());
+ assertEquals("timestamp",
+ JsonSchemaFinder.pickType(new JsonPrimitive("2016-01-01 56:00:00 -08:30")).toString());
+ }
+
+ @Test
+ public void testBooleans() throws Exception {
+ assertEquals("boolean",
+ JsonSchemaFinder.pickType(new JsonPrimitive(true)).toString());
+ assertEquals("void",
+ JsonSchemaFinder.pickType(JsonNull.INSTANCE).toString());
+ assertEquals("boolean",
+ JsonSchemaFinder.pickType(new JsonPrimitive(false)).toString());
+ }
+
+ @Test
+ public void testNumbers() throws Exception {
+ assertEquals("tinyint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("120"))).toString());
+ assertEquals("tinyint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-128"))).toString());
+ assertEquals("smallint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-24120"))).toString());
+ assertEquals("smallint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("128"))).toString());
+ assertEquals("int",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("60000"))).toString());
+ assertEquals("bigint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-4294967296"))).toString());
+ assertEquals("bigint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-9223372036854775808"))).toString());
+ assertEquals("bigint",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("9223372036854775807"))).toString());
+ assertEquals("decimal(19,0)",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("9223372036854775808"))).toString());
+ assertEquals("decimal(19,0)",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-9223372036854775809"))).toString());
+ assertEquals("decimal(10,6)",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("1234.567890"))).toString());
+ assertEquals("decimal(20,10)",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("-1234567890.1234567890"))).toString());
+ assertEquals("float",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("1.2e9"))).toString());
+ assertEquals("double",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("1234567890123456789012345678901234567890"))).toString());
+ assertEquals("double",
+ JsonSchemaFinder.pickType(new JsonPrimitive
+ (new LazilyParsedNumber("1.2E40"))).toString());
+ }
+
+ @Test
+ public void testLists() throws Exception {
+ assertEquals("list<void>",
+ JsonSchemaFinder.pickType(new JsonArray()).toString());
+ JsonArray list = new JsonArray();
+ list.add(new JsonPrimitive(50000));
+ assertEquals("list<int>", JsonSchemaFinder.pickType(list).toString());
+ list = new JsonArray();
+ list.add(new JsonPrimitive(127));
+ list.add(new JsonPrimitive(50000));
+ list.add(new JsonPrimitive(50000000000L));
+ list.add(new JsonPrimitive(-100));
+ assertEquals("list<bigint>", JsonSchemaFinder.pickType(list).toString());
+ }
+
+ @Test
+ public void testStructs() throws Exception {
+ assertEquals("struct<>",
+ JsonSchemaFinder.pickType(new JsonObject()).toString());
+ JsonObject struct = new JsonObject();
+ struct.addProperty("bool", true);
+ assertEquals("struct<bool:boolean>",
+ JsonSchemaFinder.pickType(struct).toString());
+ struct = new JsonObject();
+ struct.addProperty("str", "value");
+ struct.addProperty("i", new LazilyParsedNumber("124567"));
+ assertEquals("struct<i:int,str:string>",
+ JsonSchemaFinder.pickType(struct).toString());
+ }
+
+ @Test
+ public void testNullMerges() throws Exception {
+ assertEquals("void", JsonSchemaFinder.mergeType(
+ new NullType(),
+ new NullType()).toString());
+ assertEquals("boolean", JsonSchemaFinder.mergeType(
+ new BooleanType(),
+ new NullType()).toString());
+ assertEquals("int", JsonSchemaFinder.mergeType(
+ new NullType(),
+ new NumericType(HiveType.Kind.INT, 4, 0)
+ ).toString());
+ assertEquals("string", JsonSchemaFinder.mergeType(
+ new NullType(),
+ new StringType(HiveType.Kind.STRING)
+ ).toString());
+ assertEquals("struct<i:int>", JsonSchemaFinder.mergeType(
+ new StructType().addField("i", new NumericType(HiveType.Kind.INT, 5, 0)),
+ new NullType()
+ ).toString());
+ assertEquals("list<int>", JsonSchemaFinder.mergeType(
+ new ListType(new NumericType(HiveType.Kind.INT, 5, 0)),
+ new NullType()
+ ).toString());
+ assertEquals("uniontype<int>", JsonSchemaFinder.mergeType(
+ new UnionType().addType(new NumericType(HiveType.Kind.INT, 5, 0)),
+ new NullType()
+ ).toString());
+ }
+
+ @Test
+ public void testBooleanMerges() throws Exception {
+ assertEquals("boolean", JsonSchemaFinder.mergeType(
+ new BooleanType(),
+ new BooleanType()).toString());
+ assertEquals("uniontype<boolean,int>", JsonSchemaFinder.mergeType(
+ new BooleanType(),
+ new NumericType(HiveType.Kind.INT, 4, 0)
+ ).toString());
+ assertEquals("uniontype<boolean,string>", JsonSchemaFinder.mergeType(
+ new BooleanType(),
+ new StringType(HiveType.Kind.STRING)
+ ).toString());
+ assertEquals("uniontype<struct<i:int>,boolean>", JsonSchemaFinder.mergeType(
+ new StructType().addField("i", new NumericType(HiveType.Kind.INT, 5, 0)),
+ new BooleanType()
+ ).toString());
+ assertEquals("uniontype<list<int>,boolean>", JsonSchemaFinder.mergeType(
+ new ListType(new NumericType(HiveType.Kind.INT, 5, 0)),
+ new BooleanType()
+ ).toString());
+ assertEquals("uniontype<int,boolean>", JsonSchemaFinder.mergeType(
+ new UnionType().addType(new NumericType(HiveType.Kind.INT, 5, 0)),
+ new BooleanType()
+ ).toString());
+ }
+
+ @Test
+ public void testNumericMerges() throws Exception {
+ assertEquals("smallint", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.BYTE, 2, 0),
+ new NumericType(HiveType.Kind.SHORT, 4, 0)
+ ).toString());
+ assertEquals("int", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.INT, 6, 0),
+ new NumericType(HiveType.Kind.SHORT, 4, 0)
+ ).toString());
+ assertEquals("bigint", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.INT, 6, 0),
+ new NumericType(HiveType.Kind.LONG, 10, 0)
+ ).toString());
+ assertEquals("decimal(20,0)", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.SHORT, 4, 0),
+ new NumericType(HiveType.Kind.DECIMAL, 20, 0)
+ ).toString());
+ assertEquals("float", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.FLOAT, 21, 4),
+ new NumericType(HiveType.Kind.DECIMAL, 20, 0)
+ ).toString());
+ assertEquals("double", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.DOUBLE, 31, 4),
+ new NumericType(HiveType.Kind.DECIMAL, 20, 10)
+ ).toString());
+ assertEquals("uniontype<decimal(30,10),string>", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.DECIMAL, 20, 10),
+ new StringType(HiveType.Kind.STRING)
+ ).toString());
+ assertEquals("uniontype<struct<i:int>,smallint>", JsonSchemaFinder.mergeType(
+ new StructType().addField("i", new NumericType(HiveType.Kind.INT, 5, 0)),
+ new NumericType(HiveType.Kind.SHORT, 4, 0)
+ ).toString());
+ assertEquals("uniontype<smallint,list<int>>", JsonSchemaFinder.mergeType(
+ new NumericType(HiveType.Kind.SHORT, 4, 0),
+ new ListType(new NumericType(HiveType.Kind.INT, 5, 0))
+ ).toString());
+ assertEquals("uniontype<decimal(20,0),string>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ .addType(new StringType(HiveType.Kind.STRING)),
+ new NumericType(HiveType.Kind.DECIMAL, 20, 0)
+ ).toString());
+ }
+
+ @Test
+ public void testStringMerges() throws Exception {
+ assertEquals("string", JsonSchemaFinder.mergeType(
+ new StringType(HiveType.Kind.BINARY),
+ new StringType(HiveType.Kind.STRING)
+ ).toString());
+ assertEquals("string", JsonSchemaFinder.mergeType(
+ new StringType(HiveType.Kind.STRING),
+ new StringType(HiveType.Kind.TIMESTAMP)
+ ).toString());
+ assertEquals("uniontype<struct<i:int>,timestamp>", JsonSchemaFinder.mergeType(
+ new StructType().addField("i", new NumericType(HiveType.Kind.INT, 5, 0)),
+ new StringType(HiveType.Kind.TIMESTAMP)
+ ).toString());
+ assertEquals("uniontype<binary,list<int>>", JsonSchemaFinder.mergeType(
+ new StringType(HiveType.Kind.BINARY),
+ new ListType(new NumericType(HiveType.Kind.INT, 5, 0))
+ ).toString());
+ assertEquals("uniontype<int,string>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ .addType(new StringType(HiveType.Kind.STRING)),
+ new StringType(HiveType.Kind.TIMESTAMP)
+ ).toString());
+ }
+
+ @Test
+ public void testListMerges() throws Exception {
+ assertEquals("list<bigint>", JsonSchemaFinder.mergeType(
+ new ListType(new NumericType(HiveType.Kind.INT, 10, 0)),
+ new ListType(new NumericType(HiveType.Kind.LONG, 20, 0))
+ ).toString());
+ assertEquals("list<uniontype<int,string>>", JsonSchemaFinder.mergeType(
+ new ListType(new NumericType(HiveType.Kind.INT, 10, 0)),
+ new ListType(new StringType(HiveType.Kind.STRING))
+ ).toString());
+ assertEquals("uniontype<struct<foo:int>,list<int>>", JsonSchemaFinder.mergeType(
+ new StructType().addField("foo", new NumericType(HiveType.Kind.INT, 10, 0)),
+ new ListType(new NumericType(HiveType.Kind.INT, 5, 0))
+ ).toString());
+ assertEquals("uniontype<int,string,list<boolean>>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ .addType(new StringType(HiveType.Kind.STRING)),
+ new ListType(new BooleanType())
+ ).toString());
+ }
+
+ @Test
+ public void testStructMerges() throws Exception {
+ assertEquals("struct<bar:timestamp,foo:int>", JsonSchemaFinder.mergeType(
+ new StructType().addField("foo", new NumericType(HiveType.Kind.INT, 10, 0)),
+ new StructType().addField("bar", new StringType(HiveType.Kind.TIMESTAMP))
+ ).toString());
+ assertEquals("struct<bar:string,foo:int>", JsonSchemaFinder.mergeType(
+ new StructType()
+ .addField("foo", new NumericType(HiveType.Kind.INT, 10, 0))
+ .addField("bar", new StringType(HiveType.Kind.BINARY)),
+ new StructType()
+ .addField("bar", new StringType(HiveType.Kind.TIMESTAMP))
+ ).toString());
+ assertEquals("uniontype<int,string,struct<foo:boolean>>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ .addType(new StringType(HiveType.Kind.STRING)),
+ new StructType().addField("foo", new BooleanType())
+ ).toString());
+ }
+
+ @Test
+ public void testUnionMerges() throws Exception {
+ assertEquals("uniontype<decimal(15,10),boolean,string>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.DECIMAL, 2, 10))
+ .addType(new BooleanType())
+ .addType(new StringType(HiveType.Kind.BINARY)),
+ new UnionType()
+ .addType(new StringType(HiveType.Kind.TIMESTAMP))
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ ).toString());
+ assertEquals("uniontype<int,binary,struct<bar:timestamp>>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 10, 0))
+ .addType(new StringType(HiveType.Kind.BINARY)),
+ new StructType()
+ .addField("bar", new StringType(HiveType.Kind.TIMESTAMP))
+ ).toString());
+ assertEquals("uniontype<int,string>", JsonSchemaFinder.mergeType(
+ new UnionType()
+ .addType(new NumericType(HiveType.Kind.INT, 5, 0))
+ .addType(new StringType(HiveType.Kind.BINARY)),
+ new StringType(HiveType.Kind.TIMESTAMP)
+ ).toString());
+ }
+}