You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2018/03/26 21:06:14 UTC
orc git commit: ORC-328. Allow custom timestamp format to be provided
when converting CSV -> ORC
Repository: orc
Updated Branches:
refs/heads/master a49612ead -> ff6394f06
ORC-328. Allow custom timestamp format to be provided when converting CSV -> ORC
Added -t/-timestampformat option to the conversion tool.
Fixes #235
Signed-off-by: Owen O'Malley <om...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/orc/repo
Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/ff6394f0
Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/ff6394f0
Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/ff6394f0
Branch: refs/heads/master
Commit: ff6394f064e4cc080b030d46ec380f0f2e067cb5
Parents: a49612e
Author: Bill Warshaw <bi...@appian.com>
Authored: Wed Mar 21 12:06:22 2018 -0400
Committer: Owen O'Malley <om...@apache.org>
Committed: Mon Mar 26 14:03:40 2018 -0700
----------------------------------------------------------------------
.../apache/orc/tools/convert/ConvertTool.java | 9 ++++-
.../org/apache/orc/tools/convert/CsvReader.java | 13 ++++---
.../apache/orc/tools/convert/TestCsvReader.java | 36 +++++++++++++++-----
3 files changed, 42 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
index 33f1cf5..02d7ee8 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/ConvertTool.java
@@ -47,6 +47,8 @@ import java.util.zip.GZIPInputStream;
* A conversion tool to convert CSV or JSON files into ORC files.
*/
public class ConvertTool {
+ static final String DEFAULT_TIMESTAMP_FORMAT = "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]";
+
private final List<FileInformation> fileList;
private final TypeDescription schema;
private final char csvSeparator;
@@ -54,6 +56,7 @@ public class ConvertTool {
private final char csvEscape;
private final int csvHeaderLines;
private final String csvNullString;
+ private final String timestampFormat;
private final Writer writer;
private final VectorizedRowBatch batch;
@@ -148,7 +151,7 @@ public class ConvertTool {
case CSV: {
FSDataInputStream underlying = filesystem.open(path);
return new CsvReader(getReader(underlying), underlying, size, schema,
- csvSeparator, csvQuote, csvEscape, csvHeaderLines, csvNullString);
+ csvSeparator, csvQuote, csvEscape, csvHeaderLines, csvNullString, timestampFormat);
}
default:
throw new IllegalArgumentException("Unhandled format " + format +
@@ -186,6 +189,7 @@ public class ConvertTool {
this.csvSeparator = getCharOption(opts, 'S', ',');
this.csvHeaderLines = getIntOption(opts, 'H', 0);
this.csvNullString = opts.getOptionValue('n', "");
+ this.timestampFormat = opts.getOptionValue("t", DEFAULT_TIMESTAMP_FORMAT);
String outFilename = opts.hasOption('o')
? opts.getOptionValue('o') : "output.orc";
writer = OrcFile.createWriter(new Path(outFilename),
@@ -247,6 +251,9 @@ public class ConvertTool {
options.addOption(
Option.builder("H").longOpt("header").desc("CSV header lines")
.hasArg().build());
+ options.addOption(
+ Option.builder("t").longOpt("timestampformat").desc("Timestamp Format")
+ .hasArg().build());
CommandLine cli = new DefaultParser().parse(options, args);
if (cli.hasOption('h') || cli.getArgs().length == 0) {
HelpFormatter formatter = new HelpFormatter();
http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
index 3ce32ac..6dbdf30 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
@@ -17,6 +17,7 @@
*/
package org.apache.orc.tools.convert;
+import com.opencsv.CSVReader;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
@@ -36,16 +37,11 @@ import org.threeten.bp.ZonedDateTime;
import org.threeten.bp.format.DateTimeFormatter;
import org.threeten.bp.temporal.TemporalAccessor;
-import com.opencsv.CSVReader;
-
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
public class CsvReader implements RecordReader {
- private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern(
- "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]");
-
private long rowNumber = 0;
private final Converter converter;
private final int columns;
@@ -53,6 +49,7 @@ public class CsvReader implements RecordReader {
private final String nullString;
private final FSDataInputStream underlying;
private final long totalSize;
+ private final DateTimeFormatter dateTimeFormatter;
/**
* Create a CSV reader
@@ -76,7 +73,8 @@ public class CsvReader implements RecordReader {
char quoteChar,
char escapeChar,
int headerLines,
- String nullString) throws IOException {
+ String nullString,
+ String timestampFormat) {
this.underlying = input;
this.reader = new CSVReader(reader, separatorChar, quoteChar, escapeChar,
headerLines);
@@ -85,6 +83,7 @@ public class CsvReader implements RecordReader {
IntWritable nextColumn = new IntWritable(0);
this.converter = buildConverter(nextColumn, schema);
this.columns = nextColumn.get();
+ this.dateTimeFormatter = DateTimeFormatter.ofPattern(timestampFormat);
}
interface Converter {
@@ -252,7 +251,7 @@ public class CsvReader implements RecordReader {
} else {
TimestampColumnVector vector = (TimestampColumnVector) column;
TemporalAccessor temporalAccessor =
- DATE_TIME_FORMATTER.parseBest(values[offset],
+ dateTimeFormatter.parseBest(values[offset],
ZonedDateTime.FROM, LocalDateTime.FROM);
if (temporalAccessor instanceof ZonedDateTime) {
vector.set(row, new Timestamp(
http://git-wip-us.apache.org/repos/asf/orc/blob/ff6394f0/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
----------------------------------------------------------------------
diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
index 8943692..ed24c1e 100644
--- a/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
+++ b/java/tools/src/test/org/apache/orc/tools/convert/TestCsvReader.java
@@ -18,10 +18,6 @@
package org.apache.orc.tools.convert;
-import static org.junit.Assert.assertEquals;
-
-import java.io.StringReader;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
@@ -35,6 +31,11 @@ import org.apache.orc.TypeDescription;
import org.junit.Before;
import org.junit.Test;
+import java.io.StringReader;
+
+import static org.apache.orc.tools.convert.ConvertTool.DEFAULT_TIMESTAMP_FORMAT;
+import static org.junit.Assert.assertEquals;
+
public class TestCsvReader {
Configuration conf;
@@ -60,7 +61,7 @@ public class TestCsvReader {
TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:double,c:decimal(10,2),d:string,e:boolean,e:timestamp>");
RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
- '\\', 0, "");
+ '\\', 0, "", DEFAULT_TIMESTAMP_FORMAT);
VectorizedRowBatch batch = schema.createRowBatch(5);
assertEquals(true, reader.nextBatch(batch));
assertEquals(5, batch.size);
@@ -98,7 +99,7 @@ public class TestCsvReader {
TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:double,c:decimal(10,2),d:string>");
RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
- '\\', 0, "null");
+ '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
VectorizedRowBatch batch = schema.createRowBatch();
assertEquals(true, reader.nextBatch(batch));
assertEquals(3, batch.size);
@@ -139,7 +140,7 @@ public class TestCsvReader {
TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:struct<c:int,d:int>,e:int>");
RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
- '\\', 0, "null");
+ '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
VectorizedRowBatch batch = schema.createRowBatch();
assertEquals(true, reader.nextBatch(batch));
assertEquals(2, batch.size);
@@ -162,7 +163,7 @@ public class TestCsvReader {
TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:int,d:bigint,e:bigint>");
RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
- '\\', 0, "null");
+ '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
VectorizedRowBatch batch = schema.createRowBatch();
assertEquals(true, reader.nextBatch(batch));
assertEquals(1, batch.size);
@@ -172,4 +173,23 @@ public class TestCsvReader {
assertEquals(-9223372036854775807L, ((LongColumnVector) batch.cols[3]).vector[0]);
assertEquals(false, reader.nextBatch(batch));
}
+
+ @Test
+ public void testCustomTimestampFormat() throws Exception {
+ String tsFormat = "d[d] MMM yyyy HH:mm:ss";
+ StringReader input = new StringReader(
+ "'21 Mar 2018 12:23:34'\n" +
+ "'3 Feb 2018 18:04:51'\n"
+ );
+ TypeDescription schema = TypeDescription.fromString(
+ "struct<a:timestamp>");
+ RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
+ '\\', 0, "", tsFormat);
+ VectorizedRowBatch batch = schema.createRowBatch(2);
+ assertEquals(true, reader.nextBatch(batch));
+ assertEquals(2, batch.size);
+ TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
+ assertEquals("2018-03-21 12:23:34.0", cv.asScratchTimestamp(0).toString());
+ assertEquals("2018-02-03 18:04:51.0", cv.asScratchTimestamp(1).toString());
+ }
}