You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/03/26 21:06:14 UTC

orc git commit: ORC-328. Allow custom timestamp format to be provided when converting CSV -> ORC

Repository: orc
Updated Branches:
  refs/heads/master a49612ead -> ff6394f06


ORC-328. Allow custom timestamp format to be provided when converting CSV -> ORC

Added -t/--timestampformat option to the conversion tool.

Fixes #235

Signed-off-by: Owen O'Malley <om...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/ff6394f0
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/ff6394f0
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/ff6394f0

Branch: refs/heads/master
Commit: ff6394f064e4cc080b030d46ec380f0f2e067cb5
Parents: a49612e
Author: Bill Warshaw <bi...@appian.com>
Authored: Wed Mar 21 12:06:22 2018 -0400
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Mar 26 14:03:40 2018 -0700

----------------------------------------------------------------------
 .../apache/orc/tools/convert/ConvertTool.java   |  9 ++++-
 .../org/apache/orc/tools/convert/CsvReader.java | 13 ++++---
 .../apache/orc/tools/convert/TestCsvReader.java | 36 +++++++++++++++-----
 3 files changed, 42 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
index 33f1cf5..02d7ee8 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
@@ -47,6 +47,8 @@ import java.util.zip.GZIPInputStream;
  * A conversion tool to convert CSV or JSON files into ORC files.
  */
 public class ConvertTool {
+  static final String DEFAULT_TIMESTAMP_FORMAT = "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]";
+
   private final List<FileInformation> fileList;
   private final TypeDescription schema;
   private final char csvSeparator;
@@ -54,6 +56,7 @@ public class ConvertTool {
   private final char csvEscape;
   private final int csvHeaderLines;
   private final String csvNullString;
+  private final String timestampFormat;
   private final Writer writer;
   private final VectorizedRowBatch batch;
 
@@ -148,7 +151,7 @@ public class ConvertTool {
         case CSV: {
           FSDataInputStream underlying = filesystem.open(path);
           return new CsvReader(getReader(underlying), underlying, size, schema,
-              csvSeparator, csvQuote, csvEscape, csvHeaderLines, csvNullString);
+              csvSeparator, csvQuote, csvEscape, csvHeaderLines, csvNullString, timestampFormat);
         }
         default:
           throw new IllegalArgumentException("Unhandled format " + format +
@@ -186,6 +189,7 @@ public class ConvertTool {
     this.csvSeparator = getCharOption(opts, 'S', ',');
     this.csvHeaderLines = getIntOption(opts, 'H', 0);
     this.csvNullString = opts.getOptionValue('n', "");
+    this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT);
     String outFilename = opts.hasOption('o')
         ? opts.getOptionValue('o') : "output.orc";
     writer = OrcFile.createWriter(new Path(outFilename),
@@ -247,6 +251,9 @@ public class ConvertTool {
     options.addOption(
         Option.builder("H").longOpt("header").desc("CSV header lines")
             .hasArg().build());
+    options.addOption(
+            Option.builder("t").longOpt("timestampformat").desc("Timestamp Format")
+            .hasArg().build());
     CommandLine cli = new DefaultParser().parse(options, args);
     if (cli.hasOption('h') || cli.getArgs().length == 0) {
       HelpFormatter formatter = new HelpFormatter();

http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
index 3ce32ac..6dbdf30 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
@@ -17,6 +17,7 @@
  */
 package org.apache.orc.tools.convert;
 
+import com.opencsv.CSVReader;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
@@ -36,16 +37,11 @@ import org.threeten.bp.ZonedDateTime;
 import org.threeten.bp.format.DateTimeFormatter;
 import org.threeten.bp.temporal.TemporalAccessor;
 
-import com.opencsv.CSVReader;
-
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.sql.Timestamp;
 
 public class CsvReader implements RecordReader {
-  private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern(
-      "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]");
-
   private long rowNumber = 0;
   private final Converter converter;
   private final int columns;
@@ -53,6 +49,7 @@ public class CsvReader implements RecordReader {
   private final String nullString;
   private final FSDataInputStream underlying;
   private final long totalSize;
+  private final DateTimeFormatter dateTimeFormatter;
 
   /**
    * Create a CSV reader
@@ -76,7 +73,8 @@ public class CsvReader implements RecordReader {
                    char quoteChar,
                    char escapeChar,
                    int headerLines,
-                   String nullString) throws IOException {
+                   String nullString,
+                   String timestampFormat) {
     this.underlying = input;
     this.reader = new CSVReader(reader, separatorChar, quoteChar, escapeChar,
         headerLines);
@@ -85,6 +83,7 @@ public class CsvReader implements RecordReader {
     IntWritable nextColumn = new IntWritable(0);
     this.converter = buildConverter(nextColumn, schema);
     this.columns = nextColumn.get();
+    this.dateTimeFormatter = DateTimeFormatter.ofPattern(timestampFormat);
   }
 
   interface Converter {
@@ -252,7 +251,7 @@ public class CsvReader implements RecordReader {
       } else {
         TimestampColumnVector vector = (TimestampColumnVector) column;
         TemporalAccessor temporalAccessor =
-            DATE_TIME_FORMATTER.parseBest(values[offset],
+            dateTimeFormatter.parseBest(values[offset],
                 ZonedDateTime.FROM, LocalDateTime.FROM);
         if (temporalAccessor instanceof ZonedDateTime) {
           vector.set(row, new Timestamp(

http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
index 8943692..ed24c1e 100644
--- a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
+++ b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
@@ -18,10 +18,6 @@
 
 package org.apache.orc.tools.convert;
 
-import static org.junit.Assert.assertEquals;
-
-import java.io.StringReader;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
@@ -35,6 +31,11 @@ import org.apache.orc.TypeDescription;
 import org.junit.Before;
 import org.junit.Test;
 
+import java.io.StringReader;
+
+import static org.apache.orc.tools.convert.ConvertTool.DEFAULT_TIMESTAMP_FORMAT;
+import static org.junit.Assert.assertEquals;
+
 public class TestCsvReader {
 
   Configuration conf;
@@ -60,7 +61,7 @@ public class TestCsvReader {
     TypeDescription schema = TypeDescription.fromString(
         "struct<a:int,b:double,c:decimal(10,2),d:string,e:boolean,e:timestamp>");
     RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
-        '\\', 0, "");
+        '\\', 0, "", DEFAULT_TIMESTAMP_FORMAT);
     VectorizedRowBatch batch = schema.createRowBatch(5);
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(5, batch.size);
@@ -98,7 +99,7 @@ public class TestCsvReader {
     TypeDescription schema = TypeDescription.fromString(
         "struct<a:int,b:double,c:decimal(10,2),d:string>");
     RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
-        '\\', 0, "null");
+        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
     VectorizedRowBatch batch = schema.createRowBatch();
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(3, batch.size);
@@ -139,7 +140,7 @@ public class TestCsvReader {
     TypeDescription schema = TypeDescription.fromString(
         "struct<a:int,b:struct<c:int,d:int>,e:int>");
     RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
-        '\\', 0, "null");
+        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
     VectorizedRowBatch batch = schema.createRowBatch();
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(2, batch.size);
@@ -162,7 +163,7 @@ public class TestCsvReader {
     TypeDescription schema = TypeDescription.fromString(
             "struct<a:int,b:int,d:bigint,e:bigint>");
     RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
-            '\\', 0, "null");
+            '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
     VectorizedRowBatch batch = schema.createRowBatch();
     assertEquals(true, reader.nextBatch(batch));
     assertEquals(1, batch.size);
@@ -172,4 +173,23 @@ public class TestCsvReader {
     assertEquals(-9223372036854775807L, ((LongColumnVector) batch.cols[3]).vector[0]);
     assertEquals(false, reader.nextBatch(batch));
   }
+
+  @Test
+  public void testCustomTimestampFormat() throws Exception {
+    String tsFormat = "d[d] MMM yyyy HH:mm:ss";
+    StringReader input = new StringReader(
+            "'21 Mar 2018 12:23:34'\n" +
+                    "'3 Feb 2018 18:04:51'\n"
+    );
+    TypeDescription schema = TypeDescription.fromString(
+            "struct<a:timestamp>");
+    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
+            '\\', 0, "", tsFormat);
+    VectorizedRowBatch batch = schema.createRowBatch(2);
+    assertEquals(true, reader.nextBatch(batch));
+    assertEquals(2, batch.size);
+    TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
+    assertEquals("2018-03-21 12:23:34.0", cv.asScratchTimestamp(0).toString());
+    assertEquals("2018-02-03 18:04:51.0", cv.asScratchTimestamp(1).toString());
+  }
 }