You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2017/02/27 22:18:51 UTC
orc git commit: ORC-150. Add tools for finding schema from JSON documents and converting JSON into ORC files.

Repository: orc
Updated Branches:
  refs/heads/master 9291a06e0 -> 9805e139b


ORC-150. Add tools for finding schema from JSON documents and converting JSON
into ORC files.

Fixes #95

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/9805e139
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/9805e139
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/9805e139

Branch: refs/heads/master
Commit: 9805e139bf2041ddfd63b67ed85370bc3f4b09da
Parents: 9291a06
Author: Owen O'Malley <om...@apache.org>
Authored: Mon Feb 20 20:37:18 2017 -0800
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Feb 27 14:17:25 2017 -0800

----------------------------------------------------------------------
 .../src/java/org/apache/orc/RecordReader.java   |   2 +-
 .../orc/mapred/OrcMapredRecordReader.java       |   8 +
 .../orc/mapreduce/OrcMapreduceRecordReader.java |   9 +
 java/pom.xml                                    |   2 +-
 .../src/java/org/apache/orc/tools/Driver.java   |   8 +
 .../apache/orc/tools/convert/ConvertTool.java   |  95 +++++
 .../apache/orc/tools/convert/JsonReader.java    | 306 ++++++++++++++++
 .../org/apache/orc/tools/json/BooleanType.java  |  50 +++
 .../org/apache/orc/tools/json/HiveType.java     |  95 +++++
 .../apache/orc/tools/json/JsonSchemaFinder.java | 300 ++++++++++++++++
 .../org/apache/orc/tools/json/JsonShredder.java | 116 +++++++
 .../org/apache/orc/tools/json/ListType.java     |  87 +++++
 .../org/apache/orc/tools/json/NullType.java     |  50 +++
 .../org/apache/orc/tools/json/NumericType.java  | 114 ++++++
 .../org/apache/orc/tools/json/StringType.java   |  76 ++++
 .../org/apache/orc/tools/json/StructType.java   | 114 ++++++
 .../org/apache/orc/tools/json/UnionType.java    | 122 +++++++
 .../orc/tools/json/TestJsonSchemaFinder.java    | 346 +++++++++++++++++++
 18 files changed, 1898 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/core/src/java/org/apache/orc/RecordReader.java
----------------------------------------------------------------------
diff --git a/java/core/src/java/org/apache/orc/RecordReader.java b/java/core/src/java/org/apache/orc/RecordReader.java
index 09ba0f0..2621a8e 100644
--- a/java/core/src/java/org/apache/orc/RecordReader.java
+++ b/java/core/src/java/org/apache/orc/RecordReader.java
@@ -24,7 +24,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 /**
  * A row-by-row iterator for ORC files.
  */
-public interface RecordReader {
+public interface RecordReader extends AutoCloseable {
   /**
    * Read the next row batch. The size of the batch to read cannot be
    * controlled by the callers. Caller need to look at

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java b/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
index ddbc396..4c3c0d3 100644
--- a/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
+++ b/java/mapreduce/src/java/org/apache/orc/mapred/OrcMapredRecordReader.java
@@ -60,6 +60,14 @@ public class OrcMapredRecordReader<V extends WritableComparable>
   private final VectorizedRowBatch batch;
   private int rowInBatch;
 
+  public OrcMapredRecordReader(RecordReader reader,
+                               TypeDescription schema) throws IOException {
+    this.batchReader = reader;
+    this.batch = schema.createRowBatch();
+    this.schema = schema;
+    rowInBatch = 0;
+  }
+
   protected OrcMapredRecordReader(Reader fileReader,
                                   Reader.Options options) throws IOException {
     this.batchReader = fileReader.rows(options);

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
----------------------------------------------------------------------
diff --git a/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java b/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
index f686e05..21a5e01 100644
--- a/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
+++ b/java/mapreduce/src/java/org/apache/orc/mapreduce/OrcMapreduceRecordReader.java
@@ -45,6 +45,15 @@ public class OrcMapreduceRecordReader<V extends WritableComparable>
   private int rowInBatch;
   private final V row;
 
+  public OrcMapreduceRecordReader(RecordReader reader,
+                                  TypeDescription schema) throws IOException {
+    this.batchReader = reader;
+    this.batch = schema.createRowBatch();
+    this.schema = schema;
+    rowInBatch = 0;
+    this.row = (V) OrcStruct.createValue(schema);
+  }
+
   public OrcMapreduceRecordReader(Reader fileReader,
                                   Reader.Options options) throws IOException {
     this.batchReader = fileReader.rows(options);

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/pom.xml
----------------------------------------------------------------------
diff --git a/java/pom.xml b/java/pom.xml
index 2c9485f..5c9ab04 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -272,7 +272,7 @@
       <dependency>
         <groupId>commons-cli</groupId>
         <artifactId>commons-cli</artifactId>
-        <version>1.2</version>
+        <version>1.3.1</version>
       </dependency>
       <dependency>
         <groupId>commons-codec</groupId>

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/Driver.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java
index 37bedbe..9bba013 100644
--- a/java/tools/src/java/org/apache/orc/tools/Driver.java
+++ b/java/tools/src/java/org/apache/orc/tools/Driver.java
@@ -24,6 +24,8 @@ import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
 import org.apache.commons.cli.ParseException;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.tools.convert.ConvertTool;
+import org.apache.orc.tools.json.JsonSchemaFinder;
 
 import java.util.Map;
 import java.util.Properties;
@@ -87,6 +89,8 @@ public class Driver {
       System.err.println("   meta - print the metadata about the ORC file");
       System.err.println("   data - print the data from the ORC file");
       System.err.println("   scan - scan the ORC file");
+      System.err.println("   convert - convert JSON files to ORC");
+      System.err.println("   json-schema - scan JSON files to determine their schema");
       System.err.println();
       System.err.println("To get more help, provide -h to the command");
       System.exit(1);
@@ -102,6 +106,10 @@ public class Driver {
       PrintData.main(conf, options.commandArgs);
     } else if ("scan".equals(options.command)) {
       ScanData.main(conf, options.commandArgs);
+    } else if ("json-schema".equals(options.command)) {
+      JsonSchemaFinder.main(conf, options.commandArgs);
+    } else if ("convert".equals(options.command)) {
+      ConvertTool.main(conf, options.commandArgs);
     } else {
       System.err.println("Unknown subcommand: " + options.command);
       System.exit(1);

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
new file mode 100644
index 0000000..81fc2ec
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.convert;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.json.JsonSchemaFinder;
+
+import java.io.IOException;
+
+/**
+ * A conversion tool to convert JSON files into ORC files.
+ */
+public class ConvertTool {
+
+  static TypeDescription computeSchema(String[] filename) throws IOException {
+    JsonSchemaFinder schemaFinder = new JsonSchemaFinder();
+    for(String file: filename) {
+      System.err.println("Scanning " + file + " for schema");
+      schemaFinder.addFile(file);
+    }
+    return schemaFinder.getSchema();
+  }
+
+  public static void main(Configuration conf,
+                          String[] args) throws IOException, ParseException {
+    CommandLine opts = parseOptions(args);
+    TypeDescription schema;
+    if (opts.hasOption('s')) {
+      schema = TypeDescription.fromString(opts.getOptionValue('s'));
+    } else {
+      schema = computeSchema(opts.getArgs());
+    }
+    String outFilename = opts.hasOption('o')
+        ? opts.getOptionValue('o') : "output.orc";
+    Writer writer = OrcFile.createWriter(new Path(outFilename),
+        OrcFile.writerOptions(conf).setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    for (String file: opts.getArgs()) {
+      System.err.println("Processing " + file);
+      RecordReader reader = new JsonReader(new Path(file), schema, conf);
+      while (reader.nextBatch(batch)) {
+        writer.addRowBatch(batch);
+      }
+      reader.close();
+    }
+    writer.close();
+  }
+
+  static CommandLine parseOptions(String[] args) throws ParseException {
+    Options options = new Options();
+
+    options.addOption(
+        Option.builder("h").longOpt("help").desc("Provide help").build());
+    options.addOption(
+        Option.builder("s").longOpt("schema").hasArg()
+            .desc("The schema to write in to the file").build());
+    options.addOption(
+        Option.builder("o").longOpt("output").desc("Output filename")
+            .hasArg().build());
+    CommandLine cli = new GnuParser().parse(options, args);
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("convert", options);
+      System.exit(1);
+    }
+    return cli;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
new file mode 100644
index 0000000..2cc5711
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
@@ -0,0 +1,306 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.convert;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonStreamParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.sql.Timestamp;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+
+public class JsonReader implements RecordReader {
+  private final TypeDescription schema;
+  private final JsonStreamParser parser;
+  private final JsonConverter[] converters;
+  private final long totalSize;
+  private final FSDataInputStream rawStream;
+  private long rowNumber = 0;
+
+  interface JsonConverter {
+    void convert(JsonElement value, ColumnVector vect, int row);
+  }
+
+  static class BooleanColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        LongColumnVector vector = (LongColumnVector) vect;
+        vector.vector[row] = value.getAsBoolean() ? 1 : 0;
+      }
+    }
+  }
+
+  static class LongColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        LongColumnVector vector = (LongColumnVector) vect;
+        vector.vector[row] = value.getAsLong();
+      }
+    }
+  }
+
+  static class DoubleColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        DoubleColumnVector vector = (DoubleColumnVector) vect;
+        vector.vector[row] = value.getAsDouble();
+      }
+    }
+  }
+
+  static class StringColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        BytesColumnVector vector = (BytesColumnVector) vect;
+        byte[] bytes = value.getAsString().getBytes(StandardCharsets.UTF_8);
+        vector.setRef(row, bytes, 0, bytes.length);
+      }
+    }
+  }
+
+  static class BinaryColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        BytesColumnVector vector = (BytesColumnVector) vect;
+        String binStr = value.getAsString();
+        byte[] bytes = new byte[binStr.length()/2];
+        for(int i=0; i < bytes.length; ++i) {
+          bytes[i] = (byte) Integer.parseInt(binStr.substring(i*2, i*2+2), 16);
+        }
+        vector.setRef(row, bytes, 0, bytes.length);
+      }
+    }
+  }
+
+  static class TimestampColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        TimestampColumnVector vector = (TimestampColumnVector) vect;
+        vector.set(row, Timestamp.valueOf(value.getAsString()
+            .replaceAll("[TZ]", " ")));
+      }
+    }
+  }
+
+  static class DecimalColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        DecimalColumnVector vector = (DecimalColumnVector) vect;
+        vector.vector[row].set(HiveDecimal.create(value.getAsString()));
+      }
+    }
+  }
+
+  static class StructColumnConverter implements JsonConverter {
+    private JsonConverter[] childrenConverters;
+    private List<String> fieldNames;
+
+    public StructColumnConverter(TypeDescription schema) {
+      List<TypeDescription> kids = schema.getChildren();
+      childrenConverters = new JsonConverter[kids.size()];
+      for(int c=0; c < childrenConverters.length; ++c) {
+        childrenConverters[c] = createConverter(kids.get(c));
+      }
+      fieldNames = schema.getFieldNames();
+    }
+
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        StructColumnVector vector = (StructColumnVector) vect;
+        JsonObject obj = value.getAsJsonObject();
+        for(int c=0; c < childrenConverters.length; ++c) {
+          JsonElement elem = obj.get(fieldNames.get(c));
+          childrenConverters[c].convert(elem, vector.fields[c], row);
+        }
+      }
+    }
+  }
+
+  static class ListColumnConverter implements JsonConverter {
+    private JsonConverter childrenConverter;
+
+    public ListColumnConverter(TypeDescription schema) {
+      childrenConverter = createConverter(schema.getChildren().get(0));
+    }
+
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        ListColumnVector vector = (ListColumnVector) vect;
+        JsonArray obj = value.getAsJsonArray();
+        vector.lengths[row] = obj.size();
+        vector.offsets[row] = vector.childCount;
+        vector.childCount += vector.lengths[row];
+        vector.child.ensureSize(vector.childCount, true);
+        for(int c=0; c < obj.size(); ++c) {
+          childrenConverter.convert(obj.get(c), vector.child,
+              (int) vector.offsets[row] + c);
+        }
+      }
+    }
+  }
+
+  static JsonConverter createConverter(TypeDescription schema) {
+    switch (schema.getCategory()) {
+      case BYTE:
+      case SHORT:
+      case INT:
+      case LONG:
+        return new LongColumnConverter();
+      case FLOAT:
+      case DOUBLE:
+        return new DoubleColumnConverter();
+      case CHAR:
+      case VARCHAR:
+      case STRING:
+        return new StringColumnConverter();
+      case DECIMAL:
+        return new DecimalColumnConverter();
+      case TIMESTAMP:
+        return new TimestampColumnConverter();
+      case BINARY:
+        return new BinaryColumnConverter();
+      case BOOLEAN:
+        return new BooleanColumnConverter();
+      case STRUCT:
+        return new StructColumnConverter(schema);
+      case LIST:
+        return new ListColumnConverter(schema);
+      default:
+        throw new IllegalArgumentException("Unhandled type " + schema);
+    }
+  }
+
+  public JsonReader(Path path,
+                    TypeDescription schema,
+                    Configuration conf) throws IOException {
+    this.schema = schema;
+    FileSystem fs = path.getFileSystem(conf);
+    totalSize = fs.getFileStatus(path).getLen();
+    rawStream = fs.open(path);
+    String name = path.getName();
+    int lastDot = name.lastIndexOf(".");
+    InputStream input = rawStream;
+    if (lastDot >= 0) {
+      if (".gz".equals(name.substring(lastDot))) {
+        input = new GZIPInputStream(rawStream);
+      }
+    }
+    parser = new JsonStreamParser(new InputStreamReader(input,
+        StandardCharsets.UTF_8));
+    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
+      throw new IllegalArgumentException("Root must be struct - " + schema);
+    }
+    List<TypeDescription> fieldTypes = schema.getChildren();
+    converters = new JsonConverter[fieldTypes.size()];
+    for(int c = 0; c < converters.length; ++c) {
+      converters[c] = createConverter(fieldTypes.get(c));
+    }
+  }
+
+  public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
+    batch.reset();
+    int maxSize = batch.getMaxSize();
+    List<String> fieldNames = schema.getFieldNames();
+    while (parser.hasNext() && batch.size < maxSize) {
+      JsonObject elem = parser.next().getAsJsonObject();
+      for(int c=0; c < converters.length; ++c) {
+        // look up each field to see if it is in the input, otherwise
+        // set it to null.
+        JsonElement field = elem.get(fieldNames.get(c));
+        if (field == null) {
+          batch.cols[c].noNulls = false;
+          batch.cols[c].isNull[batch.size] = true;
+        } else {
+          converters[c].convert(field, batch.cols[c], batch.size);
+        }
+      }
+      batch.size++;
+    }
+    rowNumber += batch.size;
+    return batch.size != 0;
+  }
+
+  @Override
+  public long getRowNumber() throws IOException {
+    return rowNumber;
+  }
+
+  @Override
+  public float getProgress() throws IOException {
+    long pos = rawStream.getPos();
+    return totalSize != 0 && pos < totalSize ? (float) pos / totalSize : 1;
+  }
+
+  public void close() throws IOException {
+    rawStream.close();
+  }
+
+  @Override
+  public void seekToRow(long rowCount) throws IOException {
+    throw new UnsupportedOperationException("Seek is not supported by JsonReader");
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java b/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
new file mode 100644
index 0000000..916fe30
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/BooleanType.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * A type that represents true, false, and null.
+ */
+class BooleanType extends HiveType {
+  BooleanType() {
+    super(Kind.BOOLEAN);
+  }
+
+  @Override
+  public String toString() {
+    return "boolean";
+  }
+
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.kind == Kind.BOOLEAN || other.kind == Kind.NULL;
+  }
+
+  @Override
+  public void merge(HiveType other) {
+    // nothing to do to merge boolean types
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    return TypeDescription.createBoolean();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/HiveType.java b/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
new file mode 100644
index 0000000..6222aca
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/HiveType.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+
+/**
+ * The internal representation of what we have discovered about a given
+ * field's type.
+ */
+abstract class HiveType {
+  enum Kind {
+    NULL(0),
+    BOOLEAN(1),
+    BYTE(1), SHORT(2), INT(3), LONG(4), DECIMAL(5), FLOAT(6), DOUBLE(7),
+    BINARY(1), DATE(1), TIMESTAMP(1), STRING(2),
+    STRUCT(1, false),
+    LIST(1, false),
+    UNION(8, false);
+
+    // for types that subsume each other, establish a ranking.
+    final int rank;
+    final boolean isPrimitive;
+    Kind(int rank, boolean isPrimitive) {
+      this.rank = rank;
+      this.isPrimitive = isPrimitive;
+    }
+    Kind(int rank) {
+      this(rank, true);
+    }
+  }
+
+  protected Kind kind;
+
+  HiveType(Kind kind) {
+    this.kind = kind;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (other == null || other.getClass() != getClass()) {
+      return false;
+    }
+    return ((HiveType) other).kind.equals(kind);
+  }
+
+  @Override
+  public int hashCode() {
+    return kind.hashCode();
+  }
+
+  /**
+   * Does this type include all of the values of the other type?
+   * @param other the other type to compare against
+   * @return true, if this type includes all of the values of the other type
+   */
+  public abstract boolean subsumes(HiveType other);
+
+  /**
+   * Merge the other type into this one. It assumes that subsumes(other) is
+   * true.
+   * @param other the type to merge into this one
+   */
+  public abstract void merge(HiveType other);
+
+  /**
+   * Print this type into the stream using a flat structure given the
+   * prefix on each element.
+   * @param out the stream to print to
+   * @param prefix the prefix to add to each field name
+   */
+  public void printFlat(PrintStream out, String prefix) {
+    out.println(prefix + ": " + toString());
+  }
+
+  public abstract TypeDescription getSchema();
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
new file mode 100644
index 0000000..40841fc
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/JsonSchemaFinder.java
@@ -0,0 +1,300 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonStreamParser;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.TypeDescription;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This class determines the equivalent Hive schema for a group of JSON
+ * documents.
+ */
+public class JsonSchemaFinder {
+  private static final Logger LOG = LoggerFactory.getLogger(JsonSchemaFinder.class);
+
+  private static final Pattern HEX_PATTERN =
+      Pattern.compile("^([0-9a-fA-F][0-9a-fA-F])+$");
+  private static final Pattern TIMESTAMP_PATTERN =
+      Pattern.compile("^[\"]?([0-9]{4}[-/][0-9]{2}[-/][0-9]{2})[T ]" +
+          "([0-9]{2}:[0-9]{2}:[0-9]{2})" +
+          "(([ ][-+]?[0-9]{2}([:][0-9]{2})?)|Z)?[\"]?$");
+  private static final Pattern DECIMAL_PATTERN =
+      Pattern.compile("^-?(?<int>[0-9]+)([.](?<fraction>[0-9]+))?$");
+  private static final int INDENT = 2;
+  private static final int MAX_DECIMAL_DIGITS = 38;
+
+  static final BigInteger MIN_LONG = new BigInteger("-9223372036854775808");
+  static final BigInteger MAX_LONG = new BigInteger("9223372036854775807");
+
+  private HiveType mergedType = null;
+  private long records = 0;
+
+  /**
+   * Pick the type that describes a single JSON value.
+   * Booleans, numbers, and strings map to the matching primitive type,
+   * null maps to the null type, arrays map to a list whose element type is
+   * the merge of all of the elements, and objects map to a struct with one
+   * field per member.
+   * @param json the JSON value to examine
+   * @return the type that describes the value
+   */
+  static HiveType pickType(JsonElement json) {
+    if (json.isJsonPrimitive()) {
+      JsonPrimitive prim = (JsonPrimitive) json;
+      if (prim.isBoolean()) {
+        return new BooleanType();
+      } else if (prim.isNumber()) {
+        return pickNumericType(prim);
+      } else {
+        String str = prim.getAsString();
+        if (TIMESTAMP_PATTERN.matcher(str).matches()) {
+          return new StringType(HiveType.Kind.TIMESTAMP);
+        } else if (HEX_PATTERN.matcher(str).matches()) {
+          return new StringType(HiveType.Kind.BINARY);
+        } else {
+          return new StringType(HiveType.Kind.STRING);
+        }
+      }
+    } else if (json.isJsonNull()) {
+      return new NullType();
+    } else if (json.isJsonArray()) {
+      ListType result = new ListType();
+      // start from the null type, so that an empty array has void elements
+      result.elementType = new NullType();
+      for(JsonElement child: ((JsonArray) json)) {
+        // neither argument is null, so this subsumes, merges, or unions
+        result.elementType = mergeType(result.elementType, pickType(child));
+      }
+      return result;
+    } else {
+      JsonObject obj = (JsonObject) json;
+      StructType result = new StructType();
+      for(Map.Entry<String,JsonElement> field: obj.entrySet()) {
+        result.fields.put(field.getKey(), pickType(field.getValue()));
+      }
+      return result;
+    }
+  }
+
+  /**
+   * Pick the narrowest numeric type that can hold a JSON number.
+   * Plain integers become byte, short, int, or long depending on their
+   * value; other plain decimal literals with at most MAX_DECIMAL_DIGITS
+   * digits become decimals; everything else becomes float or double.
+   * @param prim the JSON number to examine
+   * @return the numeric type for the value
+   */
+  private static HiveType pickNumericType(JsonPrimitive prim) {
+    Matcher matcher = DECIMAL_PATTERN.matcher(prim.getAsString());
+    if (matcher.matches()) {
+      int intDigits = matcher.group("int").length();
+      String fraction = matcher.group("fraction");
+      int scale = fraction == null ? 0 : fraction.length();
+      if (scale == 0) {
+        if (intDigits < 19) {
+          // fewer than 19 digits always fits inside of a long
+          long value = prim.getAsLong();
+          if (value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE) {
+            return new NumericType(HiveType.Kind.BYTE, intDigits, scale);
+          } else if (value >= Short.MIN_VALUE && value <= Short.MAX_VALUE) {
+            return new NumericType(HiveType.Kind.SHORT, intDigits, scale);
+          } else if (value >= Integer.MIN_VALUE &&
+                     value <= Integer.MAX_VALUE) {
+            return new NumericType(HiveType.Kind.INT, intDigits, scale);
+          } else {
+            return new NumericType(HiveType.Kind.LONG, intDigits, scale);
+          }
+        } else if (intDigits == 19) {
+          // at 19 digits, it may fit inside a long, but we need to check
+          BigInteger val = prim.getAsBigInteger();
+          if (val.compareTo(MIN_LONG) >= 0 && val.compareTo(MAX_LONG) <= 0) {
+            return new NumericType(HiveType.Kind.LONG, intDigits, scale);
+          }
+        }
+      }
+      if (intDigits + scale <= MAX_DECIMAL_DIGITS) {
+        return new NumericType(HiveType.Kind.DECIMAL, intDigits, scale);
+      }
+    }
+    // Float.MIN_VALUE is the smallest positive float rather than the most
+    // negative one, so the range check has to be made on the magnitude.
+    // Otherwise zero and all of the negative values are pushed to double.
+    double magnitude = Math.abs(prim.getAsDouble());
+    if (magnitude == 0 ||
+        (magnitude >= Float.MIN_VALUE && magnitude <= Float.MAX_VALUE)) {
+      return new NumericType(HiveType.Kind.FLOAT, 0, 0);
+    } else {
+      return new NumericType(HiveType.Kind.DOUBLE, 0, 0);
+    }
+  }
+
+  /**
+   * Combine two types into a type that covers both of them.
+   * A null argument means that there is no type yet, so the other argument
+   * is returned as is. Otherwise the type that subsumes the other absorbs
+   * it, and if neither subsumes the other the result is a union of the two.
+   * @param previous the type seen so far, which may be null
+   * @param type the new type, which may be null
+   * @return the combined type
+   */
+  static HiveType mergeType(HiveType previous, HiveType type) {
+    if (previous == null) {
+      return type;
+    }
+    if (type == null) {
+      return previous;
+    }
+    if (previous.subsumes(type)) {
+      previous.merge(type);
+      return previous;
+    }
+    if (type.subsumes(previous)) {
+      type.merge(previous);
+      return type;
+    }
+    return new UnionType(previous, type);
+  }
+
+  static void printType(PrintStream out, HiveType type, int margin) {
+    if (type == null) {
+      out.print("void");
+    } else if (type.kind.isPrimitive) {
+      out.print(type.toString());
+    } else {
+      switch (type.kind) {
+        case STRUCT:
+          out.println("struct <");
+          boolean first = true;
+          for(Map.Entry<String, HiveType> field:
+              ((StructType) type).fields.entrySet()) {
+            if (!first) {
+              out.println(",");
+            } else {
+              first = false;
+            }
+            for(int i=0; i < margin; i++) {
+              out.print(' ');
+            }
+            out.print(field.getKey());
+            out.print(": ");
+            printType(out, field.getValue(), margin + INDENT);
+          }
+          out.print(">");
+          break;
+        case LIST:
+          out.print("array <");
+          printType(out, ((ListType) type).elementType, margin + INDENT);
+          out.print(">");
+          break;
+        case UNION:
+          out.print("uniontype <");
+          first = true;
+          for(HiveType child: ((UnionType) type).children) {
+            if (!first) {
+              out.print(',');
+            } else {
+              first = false;
+            }
+            printType(out, child, margin + INDENT);
+          }
+          out.print(">");
+          break;
+        default:
+          throw new IllegalArgumentException("Unknown kind " + type.kind);
+      }
+    }
+  }
+
+  /**
+   * Print a struct as a Hive create table statement for a table named tbl,
+   * with one column per field of the struct.
+   * @param out the stream to print to
+   * @param type the struct whose fields become the columns
+   */
+  static void printAsTable(PrintStream out, StructType type) {
+    out.println("create table tbl (");
+    boolean needsSeparator = false;
+    for(Map.Entry<String, HiveType> column: type.fields.entrySet()) {
+      if (needsSeparator) {
+        out.println(",");
+      }
+      needsSeparator = true;
+      for(int remaining = INDENT; remaining > 0; --remaining) {
+        out.print(' ');
+      }
+      out.print(column.getKey());
+      out.print(" ");
+      printType(out, column.getValue(), 2 * INDENT);
+    }
+    out.println();
+    out.println(")");
+  }
+
+  public void addFile(String filename) throws IOException {
+    java.io.Reader reader;
+    FileInputStream inputStream = new FileInputStream(filename);
+    if (filename.endsWith(".gz")) {
+      reader = new InputStreamReader(new GZIPInputStream(inputStream),
+          StandardCharsets.UTF_8);
+    } else {
+      reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
+    }
+    JsonStreamParser parser = new JsonStreamParser(reader);
+    while (parser.hasNext()) {
+      records += 1;
+      mergedType = mergeType(mergedType, pickType(parser.next()));
+    }
+  }
+
+  /**
+   * Get the schema that covers all of the documents added so far.
+   * The merged type is null until addFile has read at least one document,
+   * so calling this before then throws a NullPointerException.
+   * @return the TypeDescription of the merged type
+   */
+  public TypeDescription getSchema() {
+    return mergedType.getSchema();
+  }
+
+  /**
+   * Find the schema for the given JSON files and print it to standard out.
+   * The progress messages go to standard error. The -f option prints the
+   * flat list of types, the -t option prints a Hive table declaration, and
+   * otherwise the schema is printed as a TypeDescription.
+   * @param conf the configuration, which is not used by this method
+   * @param args the command line options followed by the file names
+   * @throws IllegalArgumentException if no documents were read, or if a
+   *   table was requested and the merged type is not a struct
+   * @throws Exception if the arguments can't be parsed or a file can't be
+   *   read
+   */
+  public static void main(Configuration conf,
+                          String[] args) throws Exception {
+    JsonSchemaFinder result = new JsonSchemaFinder();
+    CommandLine cli = parseArguments(args);
+    for (String filename: cli.getArgs()) {
+      System.err.println("Reading file " + filename);
+      result.addFile(filename);
+    }
+    System.err.println(result.records + " records read");
+    System.err.println();
+    if (result.mergedType == null) {
+      // fail with a clear message instead of a NullPointerException
+      throw new IllegalArgumentException(
+          "No JSON documents were read, so there is no schema to print");
+    }
+    if (cli.hasOption('f')) {
+      result.mergedType.printFlat(System.out, "root");
+    } else if (cli.hasOption('t')) {
+      // only a struct has the fields that become the table's columns
+      if (!(result.mergedType instanceof StructType)) {
+        throw new IllegalArgumentException("Can't print type " +
+            result.mergedType + " as a table, because it is not a struct");
+      }
+      printAsTable(System.out, (StructType) result.mergedType);
+    } else {
+      System.out.println(result.getSchema());
+    }
+  }
+
+  /**
+   * Parse the command line. The supported options are -h/--help,
+   * -f/--flat, and -t/--table. When help is requested, the usage is printed
+   * and the process exits with status 1.
+   * @param args the command line arguments
+   * @return the parsed command line
+   * @throws ParseException if the arguments are not valid
+   */
+  static CommandLine parseArguments(String[] args) throws ParseException {
+    Option help = Option.builder("h").longOpt("help")
+        .desc("Provide help").build();
+    Option flat = Option.builder("f").longOpt("flat")
+        .desc("Print types as flat list of types").build();
+    Option table = Option.builder("t").longOpt("table")
+        .desc("Print types as Hive table declaration").build();
+
+    Options options = new Options();
+    options.addOption(help);
+    options.addOption(flat);
+    options.addOption(table);
+
+    CommandLine cli = new GnuParser().parse(options, args);
+    if (!cli.hasOption('h')) {
+      return cli;
+    }
+    new HelpFormatter().printHelp("json-schema", options);
+    System.exit(1);
+    return cli;
+  }
+
+  /**
+   * Command line entry point, which runs the tool with a new Configuration.
+   * @param args the command line options followed by the file names
+   */
+  public static void main(String[] args) throws Exception {
+    main(new Configuration(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java b/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
new file mode 100644
index 0000000..2f626a5
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/JsonShredder.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.JsonStreamParser;
+
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * This class takes a set of JSON documents and shreds them into a file per
+ * a primitive column. This is useful when trying to understand a set of
+ * documents by providing sample values for each of the columns.
+ *
+ * For example, a document that looks like:
+ * {'a': 'aaaa', 'b': { 'c': 12, 'd': true}, 'e': 'eeee'}
+ *
+ * Will produce 4 files, each named after its column with a ".txt" suffix,
+ * with the given contents:
+ * root.a: aaaa
+ * root.b.c: 12
+ * root.b.d: true
+ * root.e: eeee
+ */
+public class JsonShredder {
+
+  private final Map<String, PrintStream> files =
+      new HashMap<String, PrintStream>();
+
+  /**
+   * Get the output stream for a column, creating the file named after the
+   * column with a ".txt" suffix the first time that the column is seen.
+   * @param name the name of the column
+   * @return the stream that collects the values of the column
+   * @throws IOException if the file can't be created
+   */
+  private PrintStream getFile(String name) throws IOException {
+    PrintStream stream = files.get(name);
+    if (stream != null) {
+      return stream;
+    }
+    FileOutputStream output = new FileOutputStream(name + ".txt");
+    stream = new PrintStream(output, false, StandardCharsets.UTF_8.name());
+    files.put(name, stream);
+    return stream;
+  }
+
+  /**
+   * Write the primitive values inside of a JSON value to their column files.
+   * Primitives are written to the file for the name, nulls are skipped,
+   * array elements are shredded under the name with ".list" appended, and
+   * object members are shredded under the name with the field name appended.
+   * @param name the column name of the value
+   * @param json the value to shred
+   * @throws IOException if a column file can't be created
+   */
+  private void shredObject(String name, JsonElement json) throws IOException {
+    if (json.isJsonPrimitive()) {
+      String text = ((JsonPrimitive) json).getAsString();
+      getFile(name).println(text);
+      return;
+    }
+    if (json.isJsonNull()) {
+      // nulls don't have a value to record
+      return;
+    }
+    if (json.isJsonArray()) {
+      String elementName = name + ".list";
+      for(JsonElement element: ((JsonArray) json)) {
+        shredObject(elementName, element);
+      }
+      return;
+    }
+    for(Map.Entry<String,JsonElement> member:
+        ((JsonObject) json).entrySet()) {
+      shredObject(name + "." + member.getKey(), member.getValue());
+    }
+  }
+
+  /**
+   * Close all of the column files that have been opened.
+   */
+  private void close() throws IOException {
+    for(PrintStream stream: files.values()) {
+      stream.close();
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    int count = 0;
+    JsonShredder shredder = new JsonShredder();
+    for (String filename: args) {
+      System.out.println("Reading " + filename);
+      System.out.flush();
+      java.io.Reader reader;
+      FileInputStream inStream = new FileInputStream(filename);
+      if (filename.endsWith(".gz")) {
+        reader = new InputStreamReader(new GZIPInputStream(inStream),
+            StandardCharsets.UTF_8);
+      } else {
+        reader = new InputStreamReader(inStream, StandardCharsets.UTF_8);
+      }
+      JsonStreamParser parser = new JsonStreamParser(reader);
+      while (parser.hasNext()) {
+        count += 1;
+        JsonElement item = parser.next();
+        shredder.shredObject("root", item);
+      }
+    }
+    shredder.close();
+    System.out.println(count + " records read");
+    System.out.println();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/ListType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/ListType.java b/java/tools/src/java/org/apache/orc/tools/json/ListType.java
new file mode 100644
index 0000000..7ef80fd
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/ListType.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+
+/**
+ * A model for types that are lists.
+ */
+class ListType extends HiveType {
+  // the merged type of the elements of the list
+  HiveType elementType;
+
+  /**
+   * Create a list whose elements are not known yet. The element type starts
+   * as the null type, so that the list can be printed, compared, and
+   * converted to a schema before an element type has been assigned.
+   */
+  public ListType() {
+    this(new NullType());
+  }
+
+  /**
+   * Create a list with the given element type.
+   * @param child the type of the elements
+   */
+  public ListType(HiveType child) {
+    super(Kind.LIST);
+    this.elementType = child;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("list<");
+    buf.append(elementType.toString());
+    buf.append(">");
+    return buf.toString();
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    // super.equals ensures that other is a ListType before the cast
+    return super.equals(other) &&
+        elementType.equals(((ListType) other).elementType);
+  }
+
+  @Override
+  public int hashCode() {
+    return super.hashCode() * 3 + elementType.hashCode();
+  }
+
+  /**
+   * A list includes null and every other list, because the element types
+   * can always be merged.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.kind == Kind.NULL || other.kind == Kind.LIST;
+  }
+
+  /**
+   * Merge another list into this one by combining the element types.
+   * Merging anything other than a list leaves this type unchanged.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other instanceof ListType) {
+      ListType otherList = (ListType) other;
+      if (elementType.subsumes(otherList.elementType)) {
+        elementType.merge(otherList.elementType);
+      } else if (otherList.elementType.subsumes(elementType)) {
+        otherList.elementType.merge(elementType);
+        elementType = otherList.elementType;
+      } else {
+        // neither element type covers the other, so keep both
+        elementType = new UnionType(elementType, otherList.elementType);
+      }
+    }
+  }
+
+  /**
+   * Print the element type in the flat format under the name of the list
+   * with "._list" appended.
+   * @param out the stream to print to
+   * @param prefix the name of the list
+   */
+  @Override
+  public void printFlat(PrintStream out, String prefix) {
+    elementType.printFlat(out, prefix + "._list");
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    return TypeDescription.createList(elementType.getSchema());
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/NullType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/NullType.java b/java/tools/src/java/org/apache/orc/tools/json/NullType.java
new file mode 100644
index 0000000..fa22a3b
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/NullType.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * The type that only includes the null value.
+ */
+class NullType extends HiveType {
+  NullType() {
+    super(Kind.NULL);
+  }
+
+  /** The null type is printed using the name "void". */
+  @Override
+  public String toString() {
+    return "void";
+  }
+
+  /**
+   * The null type only includes the null value, so the only type that it
+   * subsumes is another null type.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.kind == Kind.NULL;
+  }
+
+  @Override
+  public void merge(HiveType other) {
+    // nothing to do to merge null types *smile*
+  }
+
+  /**
+   * Describes the null type as a union that has no children.
+   * NOTE(review): presumably this is a stand-in because there is no void
+   * type to map to; confirm that consumers accept a union without children.
+   */
+  @Override
+  public TypeDescription getSchema() {
+    return TypeDescription.createUnion();
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/NumericType.java b/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
new file mode 100644
index 0000000..172cb4c
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/NumericType.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * A type that represents all of the numeric types: byte, short, int, long,
+ * float, double, and decimal.
+ */
+class NumericType extends HiveType {
+  // the maximum number of digits before the decimal
+  int intDigits;
+  // the maximum number of digits after the decimal
+  int scale;
+
+  /**
+   * Create a numeric type.
+   * @param kind the kind of number
+   * @param intDigits the number of digits before the decimal
+   * @param scale the number of digits after the decimal
+   */
+  NumericType(Kind kind, int intDigits, int scale) {
+    super(kind);
+    this.intDigits = intDigits;
+    this.scale = scale;
+  }
+
+  /**
+   * Numeric types are equal when the kind, the digits before the decimal,
+   * and the scale all match.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (super.equals(other)) {
+      NumericType otherNumber = (NumericType) other;
+      // both fields must match; accepting either one would make equals
+      // inconsistent with hashCode, which uses both of them
+      return intDigits == otherNumber.intDigits && scale == otherNumber.scale;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return super.hashCode() * 41 + (intDigits * 17) + scale;
+  }
+
+  @Override
+  public String toString() {
+    switch (kind) {
+      case BYTE:
+        return "tinyint";
+      case SHORT:
+        return "smallint";
+      case INT:
+        return "int";
+      case LONG:
+        return "bigint";
+      case DECIMAL:
+        // the precision counts the digits on both sides of the decimal
+        return "decimal(" + (intDigits + scale) + "," + scale + ")";
+      case FLOAT:
+        return "float";
+      case DOUBLE:
+        return "double";
+      default:
+        throw new IllegalArgumentException("Unknown kind " +  kind);
+    }
+  }
+
+  /**
+   * A numeric type includes null and every other numeric type, because
+   * merging widens the kind and the digits as needed.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.getClass() == NumericType.class || other.kind == Kind.NULL;
+  }
+
+  /**
+   * Merge another numeric type into this one by keeping the larger number
+   * of digits on each side of the decimal and the kind with the higher
+   * rank. Merging anything other than a numeric type leaves this type
+   * unchanged.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other.getClass() == NumericType.class) {
+      NumericType otherNumber = (NumericType) other;
+      this.intDigits = Math.max(this.intDigits, otherNumber.intDigits);
+      this.scale = Math.max(this.scale, otherNumber.scale);
+      if (kind.rank < other.kind.rank) {
+        kind = other.kind;
+      }
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    switch (kind) {
+      case BYTE:
+        return TypeDescription.createByte();
+      case SHORT:
+        return TypeDescription.createShort();
+      case INT:
+        return TypeDescription.createInt();
+      case LONG:
+        return TypeDescription.createLong();
+      case DECIMAL:
+        // Set the scale before the precision. The precision is validated
+        // against the scale that is currently set, so a small precision
+        // would otherwise be compared with the default scale of a new
+        // decimal instead of with our scale.
+        return TypeDescription.createDecimal()
+            .withScale(scale).withPrecision(intDigits+scale);
+      case FLOAT:
+        return TypeDescription.createFloat();
+      case DOUBLE:
+        return TypeDescription.createDouble();
+      default:
+        throw new IllegalArgumentException("Unknown kind " +  kind);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/StringType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/StringType.java b/java/tools/src/java/org/apache/orc/tools/json/StringType.java
new file mode 100644
index 0000000..32cb73d
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/StringType.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+/**
+ * The types that correspond to the JSON string values: string, binary,
+ * timestamp, and date.
+ */
+class StringType extends HiveType {
+  StringType(Kind kind) {
+    super(kind);
+  }
+
+  @Override
+  public String toString() {
+    switch (kind) {
+      case STRING:
+        return "string";
+      case BINARY:
+        return "binary";
+      case DATE:
+        return "date";
+      case TIMESTAMP:
+        return "timestamp";
+      default:
+        throw new IllegalArgumentException("Unknown kind " + kind);
+    }
+  }
+
+  /**
+   * A string type includes null and every other string type, because the
+   * merge falls back to a plain string.
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.kind == Kind.NULL || other.getClass() == StringType.class;
+  }
+
+  /**
+   * Merge another string type into this one. Merging anything other than a
+   * string type leaves this type unchanged.
+   * @param other the type to merge into this one
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other.getClass() != StringType.class) {
+      return;
+    }
+    // two different kinds of strings only have the plain string in common
+    if (kind != other.kind) {
+      kind = Kind.STRING;
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    switch (kind) {
+      case STRING:
+        return TypeDescription.createString();
+      case BINARY:
+        return TypeDescription.createBinary();
+      case DATE:
+        return TypeDescription.createDate();
+      case TIMESTAMP:
+        return TypeDescription.createTimestamp();
+      default:
+        throw new IllegalArgumentException("Unknown kind " + kind);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/StructType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/StructType.java b/java/tools/src/java/org/apache/orc/tools/json/StructType.java
new file mode 100644
index 0000000..c79146a
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/StructType.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Model structs.
+ */
+class StructType extends HiveType {
+  final Map<String, HiveType> fields = new TreeMap<String, HiveType>();
+
+  StructType() {
+    super(Kind.STRUCT);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("struct<");
+    boolean first = true;
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      if (!first) {
+        buf.append(',');
+      } else {
+        first = false;
+      }
+      buf.append(field.getKey());
+      buf.append(':');
+      buf.append(field.getValue().toString());
+    }
+    buf.append(">");
+    return buf.toString();
+  }
+
+  public StructType addField(String name, HiveType fieldType) {
+    fields.put(name, fieldType);
+    return this;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    return super.equals(other) && fields.equals(((StructType) other).fields);
+  }
+
+  @Override
+  public int hashCode() {
+    int result = super.hashCode() * 3;
+    for (Map.Entry<String, HiveType> pair : fields.entrySet()) {
+      result += pair.getKey().hashCode() * 17 + pair.getValue().hashCode();
+    }
+    return result;
+  }
+
+  @Override
+  public boolean subsumes(HiveType other) {
+    return other.kind == Kind.NULL || other.kind == Kind.STRUCT;
+  }
+
+  @Override
+  public void merge(HiveType other) {
+    if (other.getClass() == StructType.class) {
+      StructType otherStruct = (StructType) other;
+      for (Map.Entry<String, HiveType> pair : otherStruct.fields.entrySet()) {
+        HiveType ourField = fields.get(pair.getKey());
+        if (ourField == null) {
+          fields.put(pair.getKey(), pair.getValue());
+        } else if (ourField.subsumes(pair.getValue())) {
+          ourField.merge(pair.getValue());
+        } else if (pair.getValue().subsumes(ourField)) {
+          pair.getValue().merge(ourField);
+          fields.put(pair.getKey(), pair.getValue());
+        } else {
+          fields.put(pair.getKey(), new UnionType(ourField, pair.getValue()));
+        }
+      }
+    }
+  }
+
+  public void printFlat(PrintStream out, String prefix) {
+    prefix = prefix + ".";
+    for (Map.Entry<String, HiveType> field : fields.entrySet()) {
+      field.getValue().printFlat(out, prefix + field.getKey());
+    }
+  }
+
+  @Override
+  public TypeDescription getSchema() {
+    TypeDescription result = TypeDescription.createStruct();
+    for (Map.Entry<String, HiveType> child: fields.entrySet()) {
+      result.addField(child.getKey(), child.getValue().getSchema());
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/json/UnionType.java b/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
new file mode 100644
index 0000000..bd2fd89
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/json/UnionType.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import org.apache.orc.TypeDescription;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A union type to represent types that don't fit together.
+ * <p>
+ * Each child is one alternative. {@link #merge(HiveType)} only appends a new
+ * alternative when no existing child subsumes, or is subsumed by, the
+ * incoming type.
+ */
+class UnionType extends HiveType {
+  /** The alternatives of this union, in the order they were added. */
+  final List<HiveType> children = new ArrayList<HiveType>();
+
+  /** Creates a union with no alternatives. */
+  UnionType() {
+    super(Kind.UNION);
+  }
+
+  /**
+   * Creates a union of exactly two alternatives.
+   *
+   * @param left the first child
+   * @param right the second child
+   */
+  UnionType(HiveType left, HiveType right) {
+    super(Kind.UNION);
+    children.add(left);
+    children.add(right);
+  }
+
+  /**
+   * Appends an alternative without attempting to merge it.
+   *
+   * @param type the child to append
+   * @return this union, so calls can be chained
+   */
+  UnionType addType(HiveType type) {
+    children.add(type);
+    return this;
+  }
+
+  /**
+   * Renders the union as {@code uniontype<a,b,...>} with the children in
+   * list order.
+   */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("uniontype<");
+    String separator = "";
+    for (HiveType child : children) {
+      buf.append(separator).append(child.toString());
+      separator = ",";
+    }
+    buf.append(">");
+    return buf.toString();
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    // NOTE(review): the cast assumes super.equals has already rejected any
+    // argument that is not a UnionType -- confirm in HiveType.
+    return super.equals(other) &&
+        children.equals(((UnionType) other).children);
+  }
+
+  /**
+   * Adds 17 times each child's hash to the superclass hash; being a plain
+   * sum, the result does not depend on the order of the children.
+   */
+  @Override
+  public int hashCode() {
+    int result = super.hashCode();
+    for (HiveType child : children) {
+      result += child.hashCode() * 17;
+    }
+    return result;
+  }
+
+  /**
+   * A union accepts every type.
+   *
+   * @param other ignored
+   * @return always true
+   */
+  @Override
+  public boolean subsumes(HiveType other) {
+    return true;
+  }
+
+  /**
+   * Merges {@code other} into this union.
+   * <p>
+   * A union argument is flattened: each of its children is merged in turn.
+   * Any other type is matched against the children in order. The first child
+   * that subsumes it absorbs it. If instead it subsumes that child, it
+   * absorbs the child and takes over the child's slot. When no child matches
+   * either way, the type is appended as a new alternative.
+   *
+   * @param other the type to merge into this union
+   */
+  @Override
+  public void merge(HiveType other) {
+    if (other instanceof UnionType) {
+      for (HiveType otherChild : ((UnionType) other).children) {
+        merge(otherChild);
+      }
+      return;
+    }
+    for (int i = 0; i < children.size(); ++i) {
+      HiveType child = children.get(i);
+      if (child.subsumes(other)) {
+        child.merge(other);
+        return;
+      } else if (other.subsumes(child)) {
+        // The incoming type is the wider one, so it replaces the child.
+        other.merge(child);
+        children.set(i, other);
+        return;
+      }
+    }
+    addType(other);
+  }
+
+  /**
+   * Prints every child in flattened form, using the path
+   * {@code prefix + "." + index} where the index counts children from 0.
+   *
+   * @param out the stream handed on to each child's {@code printFlat}
+   * @param prefix the path of this union
+   */
+  @Override
+  public void printFlat(PrintStream out, String prefix) {
+    String childPrefix = prefix + ".";
+    int id = 0;
+    for (HiveType child : children) {
+      child.printFlat(out, childPrefix + (id++));
+    }
+  }
+
+  /**
+   * Converts this union into an ORC schema.
+   *
+   * @return a union {@link TypeDescription} with one union child per
+   *         alternative, in list order
+   */
+  @Override
+  public TypeDescription getSchema() {
+    TypeDescription result = TypeDescription.createUnion();
+    for (HiveType child : children) {
+      result.addUnionChild(child.getSchema());
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/orc/blob/9805e139/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java b/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
new file mode 100644
index 0000000..fac092a
--- /dev/null
+++ b/java/tools/src/test/org/apache/orc/tools/json/TestJsonSchemaFinder.java
@@ -0,0 +1,346 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.json;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonNull;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonPrimitive;
+import com.google.gson.internal.LazilyParsedNumber;
+import org.junit.Test;
+
+import static junit.framework.Assert.assertEquals;
+
+/**
+ * Checks the type that JsonSchemaFinder picks for individual JSON values and
+ * the outcome of merging pairs of types. Every expectation is compared
+ * through the resulting type's toString rendering.
+ */
+public class TestJsonSchemaFinder {
+
+  /** Picks the type of a JSON string value and returns its rendering. */
+  private static String typeOfString(String text) throws Exception {
+    return JsonSchemaFinder.pickType(new JsonPrimitive(text)).toString();
+  }
+
+  /** Picks the type of a JSON number given as its unparsed literal text. */
+  private static String typeOfNumber(String literal) throws Exception {
+    JsonPrimitive number = new JsonPrimitive(new LazilyParsedNumber(literal));
+    return JsonSchemaFinder.pickType(number).toString();
+  }
+
+  /** Merges two types through JsonSchemaFinder and returns the rendering. */
+  private static String merged(HiveType left, HiveType right) throws Exception {
+    return JsonSchemaFinder.mergeType(left, right).toString();
+  }
+
+  /** Creates a new numeric type; a fresh instance is built on every call. */
+  private static NumericType numeric(HiveType.Kind kind, int intDigits, int scale) {
+    return new NumericType(kind, intDigits, scale);
+  }
+
+  /** Creates a new INT numeric type with the given digits and scale 0. */
+  private static NumericType intType(int intDigits) {
+    return numeric(HiveType.Kind.INT, intDigits, 0);
+  }
+
+  /** Creates a new string-family type of the given kind. */
+  private static StringType text(HiveType.Kind kind) {
+    return new StringType(kind);
+  }
+
+  /** Creates a new union holding INT(5,0) followed by STRING. */
+  private static UnionType intAndStringUnion() {
+    return new UnionType()
+        .addType(intType(5))
+        .addType(text(HiveType.Kind.STRING));
+  }
+
+  /** String values that are rendered as binary versus plain string. */
+  @Test
+  public void testBinaryPatterns() throws Exception {
+    assertEquals("binary", typeOfString("00000000"));
+    assertEquals("string", typeOfString("0000000"));
+    assertEquals("string", typeOfString(""));
+    assertEquals("binary", typeOfString("0123456789abcdefABCDEF"));
+    assertEquals("string", typeOfString("00x0"));
+  }
+
+  /** String values that are rendered as timestamp versus plain string. */
+  @Test
+  public void testTimestampPatterns() throws Exception {
+    assertEquals("timestamp", typeOfString("2016-01-05T12:34:56Z"));
+    assertEquals("timestamp", typeOfString("2016/01/05 12:34:56"));
+    assertEquals("string", typeOfString("2016/01/05"));
+    assertEquals("timestamp", typeOfString("2016-01-01 56:00:00 +08"));
+    assertEquals("timestamp", typeOfString("2016-01-01 56:00:00 -08:30"));
+  }
+
+  /** Boolean literals and JSON null. */
+  @Test
+  public void testBooleans() throws Exception {
+    JsonPrimitive yes = new JsonPrimitive(true);
+    assertEquals("boolean", JsonSchemaFinder.pickType(yes).toString());
+    assertEquals("void", JsonSchemaFinder.pickType(JsonNull.INSTANCE).toString());
+    JsonPrimitive no = new JsonPrimitive(false);
+    assertEquals("boolean", JsonSchemaFinder.pickType(no).toString());
+  }
+
+  /** Numeric literals across the integer, decimal and floating point kinds. */
+  @Test
+  public void testNumbers() throws Exception {
+    assertEquals("tinyint", typeOfNumber("120"));
+    assertEquals("tinyint", typeOfNumber("-128"));
+    assertEquals("smallint", typeOfNumber("-24120"));
+    assertEquals("smallint", typeOfNumber("128"));
+    assertEquals("int", typeOfNumber("60000"));
+    assertEquals("bigint", typeOfNumber("-4294967296"));
+    assertEquals("bigint", typeOfNumber("-9223372036854775808"));
+    assertEquals("bigint", typeOfNumber("9223372036854775807"));
+    assertEquals("decimal(19,0)", typeOfNumber("9223372036854775808"));
+    assertEquals("decimal(19,0)", typeOfNumber("-9223372036854775809"));
+    assertEquals("decimal(10,6)", typeOfNumber("1234.567890"));
+    assertEquals("decimal(20,10)", typeOfNumber("-1234567890.1234567890"));
+    assertEquals("float", typeOfNumber("1.2e9"));
+    assertEquals("double",
+        typeOfNumber("1234567890123456789012345678901234567890"));
+    assertEquals("double", typeOfNumber("1.2E40"));
+  }
+
+  /** JSON arrays: empty, single element, and integers of mixed width. */
+  @Test
+  public void testLists() throws Exception {
+    JsonArray empty = new JsonArray();
+    assertEquals("list<void>", JsonSchemaFinder.pickType(empty).toString());
+
+    JsonArray single = new JsonArray();
+    single.add(new JsonPrimitive(50000));
+    assertEquals("list<int>", JsonSchemaFinder.pickType(single).toString());
+
+    JsonArray mixedWidths = new JsonArray();
+    mixedWidths.add(new JsonPrimitive(127));
+    mixedWidths.add(new JsonPrimitive(50000));
+    mixedWidths.add(new JsonPrimitive(50000000000L));
+    mixedWidths.add(new JsonPrimitive(-100));
+    assertEquals("list<bigint>",
+        JsonSchemaFinder.pickType(mixedWidths).toString());
+  }
+
+  /** JSON objects: empty, one property, and two properties. */
+  @Test
+  public void testStructs() throws Exception {
+    JsonObject empty = new JsonObject();
+    assertEquals("struct<>", JsonSchemaFinder.pickType(empty).toString());
+
+    JsonObject oneField = new JsonObject();
+    oneField.addProperty("bool", true);
+    assertEquals("struct<bool:boolean>",
+        JsonSchemaFinder.pickType(oneField).toString());
+
+    JsonObject twoFields = new JsonObject();
+    twoFields.addProperty("str", "value");
+    twoFields.addProperty("i", new LazilyParsedNumber("124567"));
+    assertEquals("struct<i:int,str:string>",
+        JsonSchemaFinder.pickType(twoFields).toString());
+  }
+
+  /** Merging the null type with each family of type. */
+  @Test
+  public void testNullMerges() throws Exception {
+    assertEquals("void", merged(new NullType(), new NullType()));
+    assertEquals("boolean", merged(new BooleanType(), new NullType()));
+    assertEquals("int", merged(new NullType(), intType(4)));
+    assertEquals("string",
+        merged(new NullType(), text(HiveType.Kind.STRING)));
+    assertEquals("struct<i:int>",
+        merged(new StructType().addField("i", intType(5)), new NullType()));
+    assertEquals("list<int>",
+        merged(new ListType(intType(5)), new NullType()));
+    assertEquals("uniontype<int>",
+        merged(new UnionType().addType(intType(5)), new NullType()));
+  }
+
+  /** Merging the boolean type with each family of type. */
+  @Test
+  public void testBooleanMerges() throws Exception {
+    assertEquals("boolean", merged(new BooleanType(), new BooleanType()));
+    assertEquals("uniontype<boolean,int>",
+        merged(new BooleanType(), intType(4)));
+    assertEquals("uniontype<boolean,string>",
+        merged(new BooleanType(), text(HiveType.Kind.STRING)));
+    assertEquals("uniontype<struct<i:int>,boolean>",
+        merged(new StructType().addField("i", intType(5)), new BooleanType()));
+    assertEquals("uniontype<list<int>,boolean>",
+        merged(new ListType(intType(5)), new BooleanType()));
+    assertEquals("uniontype<int,boolean>",
+        merged(new UnionType().addType(intType(5)), new BooleanType()));
+  }
+
+  /** Merging numeric types with each other and with other families. */
+  @Test
+  public void testNumericMerges() throws Exception {
+    assertEquals("smallint", merged(
+        numeric(HiveType.Kind.BYTE, 2, 0),
+        numeric(HiveType.Kind.SHORT, 4, 0)));
+    assertEquals("int", merged(
+        intType(6),
+        numeric(HiveType.Kind.SHORT, 4, 0)));
+    assertEquals("bigint", merged(
+        intType(6),
+        numeric(HiveType.Kind.LONG, 10, 0)));
+    assertEquals("decimal(20,0)", merged(
+        numeric(HiveType.Kind.SHORT, 4, 0),
+        numeric(HiveType.Kind.DECIMAL, 20, 0)));
+    assertEquals("float", merged(
+        numeric(HiveType.Kind.FLOAT, 21, 4),
+        numeric(HiveType.Kind.DECIMAL, 20, 0)));
+    assertEquals("double", merged(
+        numeric(HiveType.Kind.DOUBLE, 31, 4),
+        numeric(HiveType.Kind.DECIMAL, 20, 10)));
+    assertEquals("uniontype<decimal(30,10),string>", merged(
+        numeric(HiveType.Kind.DECIMAL, 20, 10),
+        text(HiveType.Kind.STRING)));
+    assertEquals("uniontype<struct<i:int>,smallint>", merged(
+        new StructType().addField("i", intType(5)),
+        numeric(HiveType.Kind.SHORT, 4, 0)));
+    assertEquals("uniontype<smallint,list<int>>", merged(
+        numeric(HiveType.Kind.SHORT, 4, 0),
+        new ListType(intType(5))));
+    assertEquals("uniontype<decimal(20,0),string>", merged(
+        intAndStringUnion(),
+        numeric(HiveType.Kind.DECIMAL, 20, 0)));
+  }
+
+  /** Merging string-family types with each other and with other families. */
+  @Test
+  public void testStringMerges() throws Exception {
+    assertEquals("string", merged(
+        text(HiveType.Kind.BINARY),
+        text(HiveType.Kind.STRING)));
+    assertEquals("string", merged(
+        text(HiveType.Kind.STRING),
+        text(HiveType.Kind.TIMESTAMP)));
+    assertEquals("uniontype<struct<i:int>,timestamp>", merged(
+        new StructType().addField("i", intType(5)),
+        text(HiveType.Kind.TIMESTAMP)));
+    assertEquals("uniontype<binary,list<int>>", merged(
+        text(HiveType.Kind.BINARY),
+        new ListType(intType(5))));
+    assertEquals("uniontype<int,string>", merged(
+        intAndStringUnion(),
+        text(HiveType.Kind.TIMESTAMP)));
+  }
+
+  /** Merging list types with each other and with other families. */
+  @Test
+  public void testListMerges() throws Exception {
+    assertEquals("list<bigint>", merged(
+        new ListType(intType(10)),
+        new ListType(numeric(HiveType.Kind.LONG, 20, 0))));
+    assertEquals("list<uniontype<int,string>>", merged(
+        new ListType(intType(10)),
+        new ListType(text(HiveType.Kind.STRING))));
+    assertEquals("uniontype<struct<foo:int>,list<int>>", merged(
+        new StructType().addField("foo", intType(10)),
+        new ListType(intType(5))));
+    assertEquals("uniontype<int,string,list<boolean>>", merged(
+        intAndStringUnion(),
+        new ListType(new BooleanType())));
+  }
+
+  /** Merging struct types with each other and into a union. */
+  @Test
+  public void testStructMerges() throws Exception {
+    assertEquals("struct<bar:timestamp,foo:int>", merged(
+        new StructType().addField("foo", intType(10)),
+        new StructType().addField("bar", text(HiveType.Kind.TIMESTAMP))));
+    assertEquals("struct<bar:string,foo:int>", merged(
+        new StructType()
+            .addField("foo", intType(10))
+            .addField("bar", text(HiveType.Kind.BINARY)),
+        new StructType().addField("bar", text(HiveType.Kind.TIMESTAMP))));
+    assertEquals("uniontype<int,string,struct<foo:boolean>>", merged(
+        intAndStringUnion(),
+        new StructType().addField("foo", new BooleanType())));
+  }
+
+  /** Merging into a union from another union, a struct, and a string type. */
+  @Test
+  public void testUnionMerges() throws Exception {
+    UnionType decimalBoolBinary = new UnionType()
+        .addType(numeric(HiveType.Kind.DECIMAL, 2, 10))
+        .addType(new BooleanType())
+        .addType(text(HiveType.Kind.BINARY));
+    UnionType timestampInt = new UnionType()
+        .addType(text(HiveType.Kind.TIMESTAMP))
+        .addType(intType(5));
+    assertEquals("uniontype<decimal(15,10),boolean,string>",
+        merged(decimalBoolBinary, timestampInt));
+
+    UnionType intBinary = new UnionType()
+        .addType(intType(10))
+        .addType(text(HiveType.Kind.BINARY));
+    StructType barStruct = new StructType()
+        .addField("bar", text(HiveType.Kind.TIMESTAMP));
+    assertEquals("uniontype<int,binary,struct<bar:timestamp>>",
+        merged(intBinary, barStruct));
+
+    UnionType narrowIntBinary = new UnionType()
+        .addType(intType(5))
+        .addType(text(HiveType.Kind.BINARY));
+    assertEquals("uniontype<int,string>",
+        merged(narrowIntBinary, text(HiveType.Kind.TIMESTAMP)));
+  }
+}