Posted to commits@hive.apache.org by ga...@apache.org on 2014/12/05 22:13:35 UTC

svn commit: r1643436 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/io/orc/FileDump.java test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java

Author: gates
Date: Fri Dec  5 21:13:34 2014
New Revision: 1643436

URL: http://svn.apache.org/r1643436
Log:
HIVE-7896 orcfiledump should be able to dump data (Alan Gates, reviewed by Prasanth Jayachandran)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
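
With this change FileDump.main parses its arguments with commons-cli: -d/--data dumps the rows of each file as line-delimited JSON, --rowindex takes a comma separated list of column ids, and -h/--help prints usage. Below is a minimal sketch of driving the tool programmatically, the same way TestFileDump.testDataDump (further down) does; the file path /tmp/example.orc and the bare default Configuration are assumptions for illustration only.

import org.apache.hadoop.hive.ql.io.orc.FileDump;

public class FileDumpUsageSketch {
  public static void main(String[] args) throws Exception {
    // -d / --data: print every row of the file as one JSON object per line on stdout.
    FileDump.main(new String[]{"/tmp/example.orc", "-d"});

    // Existing behaviour: print the file structure, plus row-index stats for columns 1 and 2.
    FileDump.main(new String[]{"/tmp/example.orc", "--rowindex", "1,2"});
  }
}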

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java Fri Dec  5 21:13:34 2014
@@ -17,20 +17,34 @@
  */
 package org.apache.hadoop.hive.ql.io.orc;
 
+import java.io.OutputStreamWriter;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import java.io.IOException;
 import java.text.DecimalFormat;
-import java.util.List;
+import java.util.Map;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
-import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
+import org.apache.hadoop.hive.serde2.io.ByteWritable;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.io.ShortWritable;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONWriter;
 
 /**
  * A tool for printing out the file structure of ORC files.
@@ -43,24 +57,40 @@ public final class FileDump {
 
   public static void main(String[] args) throws Exception {
     Configuration conf = new Configuration();
-    List<String> files = new ArrayList<String>();
+
     List<Integer> rowIndexCols = null;
-    for (String arg : args) {
-      if (arg.startsWith("--")) {
-        if (arg.startsWith(ROWINDEX_PREFIX)) {
-          String[] colStrs = arg.substring(ROWINDEX_PREFIX.length()).split(",");
-          rowIndexCols = new ArrayList<Integer>(colStrs.length);
-          for (String colStr : colStrs) {
-            rowIndexCols.add(Integer.parseInt(colStr));
-          }
-        } else {
-          System.err.println("Unknown argument " + arg);
-        }
-      } else {
-        files.add(arg);
+    Options opts = createOptions();
+    CommandLine cli = new GnuParser().parse(opts, args);
+
+    if (cli.hasOption('h')) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("orcfiledump", opts);
+      return;
+    }
+
+    boolean dumpData = cli.hasOption('d');
+    if (cli.hasOption("rowindex")) {
+      String[] colStrs = cli.getOptionValue("rowindex").split(",");
+      rowIndexCols = new ArrayList<Integer>(colStrs.length);
+      for (String colStr : colStrs) {
+        rowIndexCols.add(Integer.parseInt(colStr));
       }
     }
 
+    String[] files = cli.getArgs();
+    if (dumpData) printData(Arrays.asList(files), conf);
+    else printMetaData(Arrays.asList(files), conf, rowIndexCols);
+  }
+
+  private static void printData(List<String> files, Configuration conf) throws IOException,
+      JSONException {
+    for (String file : files) {
+      printJsonData(conf, file);
+    }
+  }
+
+  private static void printMetaData(List<String> files, Configuration conf,
+                                    List<Integer> rowIndexCols) throws IOException {
     for (String filename : files) {
       System.out.println("Structure for " + filename);
       Path path = new Path(filename);
@@ -181,4 +211,149 @@ public final class FileDump {
     }
     return paddedBytes;
   }
+
+  static Options createOptions() {
+    Options result = new Options();
+
+    // add -d and --data to print the rows
+    result.addOption(OptionBuilder
+        .withLongOpt("data")
+        .withDescription("Should the data be printed")
+        .create('d'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("help")
+        .withDescription("print help message")
+        .create('h'));
+
+    result.addOption(OptionBuilder
+        .withLongOpt("rowindex")
+        .withArgName("comma separated list of column ids for which row index should be printed")
+        .withDescription("Dump stats for column number(s)")
+        .hasArg()
+        .create());
+
+
+    return result;
+  }
+
+  private static void printMap(JSONWriter writer,
+                               Map<Object, Object> obj,
+                               List<OrcProto.Type> types,
+                               OrcProto.Type type
+  ) throws IOException, JSONException {
+    writer.array();
+    int keyType = type.getSubtypes(0);
+    int valueType = type.getSubtypes(1);
+    for(Map.Entry<Object,Object> item: obj.entrySet()) {
+      writer.object();
+      writer.key("_key");
+      printObject(writer, item.getKey(), types, keyType);
+      writer.key("_value");
+      printObject(writer, item.getValue(), types, valueType);
+      writer.endObject();
+    }
+    writer.endArray();
+  }
+
+  private static void printList(JSONWriter writer,
+                                List<Object> obj,
+                                List<OrcProto.Type> types,
+                                OrcProto.Type type
+  ) throws IOException, JSONException {
+    int subtype = type.getSubtypes(0);
+    writer.array();
+    for(Object item: obj) {
+      printObject(writer, item, types, subtype);
+    }
+    writer.endArray();
+  }
+
+  private static void printUnion(JSONWriter writer,
+                                 OrcUnion obj,
+                                 List<OrcProto.Type> types,
+                                 OrcProto.Type type
+  ) throws IOException, JSONException {
+    int subtype = type.getSubtypes(obj.getTag());
+    printObject(writer, obj.getObject(), types, subtype);
+  }
+
+  static void printStruct(JSONWriter writer,
+                          OrcStruct obj,
+                          List<OrcProto.Type> types,
+                          OrcProto.Type type) throws IOException, JSONException {
+    writer.object();
+    List<Integer> fieldTypes = type.getSubtypesList();
+    for(int i=0; i < fieldTypes.size(); ++i) {
+      writer.key(type.getFieldNames(i));
+      printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i));
+    }
+    writer.endObject();
+  }
+
+  static void printObject(JSONWriter writer,
+                          Object obj,
+                          List<OrcProto.Type> types,
+                          int typeId) throws IOException, JSONException {
+    OrcProto.Type type = types.get(typeId);
+    if (obj == null) {
+      writer.value(null);
+    } else {
+      switch (type.getKind()) {
+        case STRUCT:
+          printStruct(writer, (OrcStruct) obj, types, type);
+          break;
+        case UNION:
+          printUnion(writer, (OrcUnion) obj, types, type);
+          break;
+        case LIST:
+          printList(writer, (List<Object>) obj, types, type);
+          break;
+        case MAP:
+          printMap(writer, (Map<Object, Object>) obj, types, type);
+          break;
+        case BYTE:
+          writer.value(((ByteWritable) obj).get());
+          break;
+        case SHORT:
+          writer.value(((ShortWritable) obj).get());
+          break;
+        case INT:
+          writer.value(((IntWritable) obj).get());
+          break;
+        case LONG:
+          writer.value(((LongWritable) obj).get());
+          break;
+        case FLOAT:
+          writer.value(((FloatWritable) obj).get());
+          break;
+        case DOUBLE:
+          writer.value(((DoubleWritable) obj).get());
+          break;
+        case BOOLEAN:
+          writer.value(((BooleanWritable) obj).get());
+          break;
+        default:
+          writer.value(obj.toString());
+          break;
+      }
+    }
+  }
+
+  static void printJsonData(Configuration conf,
+                            String filename) throws IOException, JSONException {
+    Path path = new Path(filename);
+    Reader reader = OrcFile.createReader(path.getFileSystem(conf), path);
+    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
+    RecordReader rows = reader.rows(null);
+    Object row = null;
+    List<OrcProto.Type> types = reader.getTypes();
+    while (rows.hasNext()) {
+      row = rows.next(row);
+      JSONWriter writer = new JSONWriter(out);
+      printObject(writer, row, types, 0);
+      out.write("\n");
+      out.flush();
+    }
+  }
 }
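
For reference, printJsonData above emits one JSON object per row: structs become nested objects keyed by field name, lists become JSON arrays, and maps become arrays of {"_key": ..., "_value": ...} pairs (see printMap). A made-up row with hypothetical column names would come out roughly as:

{"id":1,"name":"foo","tags":[10,20],"props":[{"_key":"k1","_value":"v1"}],"nested":{"i":10,"s":"foo"}}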

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java?rev=1643436&r1=1643435&r2=1643436&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java Fri Dec  5 21:13:34 2014
@@ -22,19 +22,32 @@ import static org.junit.Assert.assertEqu
 import static org.junit.Assert.assertNull;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.PrintStream;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.Random;
 
+import junit.framework.Assert;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.common.type.HiveChar;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.HiveVarchar;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hive.common.util.HiveTestUtils;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -65,6 +78,55 @@ public class TestFileDump {
     }
   }
 
+  static class AllTypesRecord {
+    static class Struct {
+      int i;
+      String s;
+
+      Struct(int i, String s) {
+        this.i = i;
+        this.s = s;
+      }
+    }
+    boolean b;
+    byte bt;
+    short s;
+    int i;
+    long l;
+    float f;
+    double d;
+    HiveDecimal de;
+    Timestamp t;
+    Date dt;
+    String str;
+    HiveChar c;
+    HiveVarchar vc;
+    Map<String, String> m;
+    List<Integer> a;
+    Struct st;
+
+    AllTypesRecord(boolean b, byte bt, short s, int i, long l, float f, double d, HiveDecimal de,
+                   Timestamp t, Date dt, String str, HiveChar c, HiveVarchar vc, Map<String,
+                   String> m, List<Integer> a, Struct st) {
+      this.b = b;
+      this.bt = bt;
+      this.s = s;
+      this.i = i;
+      this.l = l;
+      this.f = f;
+      this.d = d;
+      this.de = de;
+      this.t = t;
+      this.dt = dt;
+      this.str = str;
+      this.c = c;
+      this.vc = vc;
+      this.m = m;
+      this.a = a;
+      this.st = st;
+    }
+  }
+
   private static void checkOutput(String expected,
                                   String actual) throws Exception {
     BufferedReader eStream =
@@ -124,6 +186,72 @@ public class TestFileDump {
     checkOutput(outputFilename, workDir + File.separator + outputFilename);
   }
 
+  @Test
+  public void testDataDump() throws Exception {
+    ObjectInspector inspector;
+    synchronized (TestOrcFile.class) {
+      inspector = ObjectInspectorFactory.getReflectionObjectInspector
+          (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
+    }
+    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
+        100000, CompressionKind.NONE, 10000, 1000);
+    Map<String, String> m = new HashMap<String, String>(2);
+    m.put("k1", "v1");
+    writer.addRow(new AllTypesRecord(
+        true,
+        (byte) 10,
+        (short) 100,
+        1000,
+        10000L,
+        4.0f,
+        20.0,
+        HiveDecimal.create(new BigDecimal(4.2222)),
+        new Timestamp(1416967764000L),
+        new Date(1416967764000L),
+        "string",
+        new HiveChar("hello", 5),
+        new HiveVarchar("hello", 10),
+        m,
+        Arrays.asList(100, 200),
+        new AllTypesRecord.Struct(10, "foo")));
+    m.clear();
+    m.put("k3", "v3");
+    writer.addRow(new AllTypesRecord(
+        false,
+        (byte)20,
+        (short)200,
+        2000,
+        20000L,
+        8.0f,
+        40.0,
+        HiveDecimal.create(new BigDecimal(2.2222)),
+        new Timestamp(1416967364000L),
+        new Date(1411967764000L),
+        "abcd",
+        new HiveChar("world", 5),
+        new HiveVarchar("world", 10),
+        m,
+        Arrays.asList(200, 300),
+        new AllTypesRecord.Struct(20, "bar")));
+
+    writer.close();
+    PrintStream origOut = System.out;
+    String outputFilename = "orc-file-dump.out";
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+
+    // replace stdout and run command
+    System.setOut(new PrintStream(myOut));
+    FileDump.main(new String[]{testFilePath.toString(), "-d"});
+    System.out.flush();
+    System.setOut(origOut);
+
+    String[] lines = myOut.toString().split("\n");
+    // Don't be fooled by the big space in the middle, this line is quite long
+    assertEquals("{\"b\":true,\"bt\":10,\"s\":100,\"i\":1000,\"l\":10000,\"f\":4,\"d\":20,\"de\":\"4.222199999999999953\",\"t\":\"2014-11-25 18:09:24\",\"dt\":\"2014-11-25\",\"str\":\"string\",\"c\":\"hello                                                                                                                                                                                                                                                          \",\"vc\":\"hello\",\"m\":[{\"_key\":\"k1\",\"_value\":\"v1\"}],\"a\":[100,200],\"st\":{\"i\":10,\"s\":\"foo\"}}", lines[0]);
+    assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.222199999999999953\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world                                                                                                                                                                                                                                                          \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}", lines[1]);
+
+  }
+
   // Test that if the fraction of rows that have distinct strings is greater than the configured
   // threshold dictionary encoding is turned off.  If dictionary encoding is turned off the length
   // of the dictionary stream for the column will be 0 in the ORC file dump.