You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by pg...@apache.org on 2021/01/20 11:08:12 UTC
[orc] branch master updated: ORC-738: Add date type conversion
support in `Java Tools` (#631)
This is an automated email from the ASF dual-hosted git repository.
pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/master by this push:
new e81e572 ORC-738: Add date type conversion support in `Java Tools` (#631)
e81e572 is described below
commit e81e572d26ecceb6ae4626d16865b98cd3a5b1c5
Author: darkamgine <da...@gmail.com>
AuthorDate: Wed Jan 20 19:08:03 2021 +0800
ORC-738: Add date type conversion support in `Java Tools` (#631)
### What changes were proposed in this pull request?
* CSV and JSON converters now accept columns of the date type instead of throwing an error for an invalid type
* Replaced the threetenbp date/time library with the standard Java `java.time` API
* Datetimes support OffsetDateTime formats, e.g. 2021-01-18T12:34:56.123+04:00
* Timestamps are now created by converting to Instant rather than by manually setting seconds and nanos
### Why are the changes needed?
Date is a common datatype and should be supported directly, instead of relying on the hack of using timestamps at 00:00:00.
The threetenbp library is outdated and is not necessary for Java 8+.
Datetime formats should allow OffsetDateTime as well as ZonedDateTime and LocalDateTime formats for full ISO 8601 compatibility.
### How was this patch tested?
TestJsonReader.testDateTypeSupport
TestJsonReader.testDateTimeTypeSupport
---
java/tools/pom.xml | 4 --
.../org/apache/orc/tools/convert/CsvReader.java | 51 ++++++++++++----
.../org/apache/orc/tools/convert/JsonReader.java | 47 +++++++++++----
.../apache/orc/tools/convert/TestJsonReader.java | 68 ++++++++++++++++++++++
4 files changed, 146 insertions(+), 24 deletions(-)
diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 4d40fcf..0de5696 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -78,10 +78,6 @@
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
- <dependency>
- <groupId>org.threeten</groupId>
- <artifactId>threetenbp</artifactId>
- </dependency>
<!-- orc-tools uber jar needs to include this -->
<dependency>
<groupId>com.google.guava</groupId>
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
index 3967a49..14e3cc9 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
@@ -21,6 +21,7 @@ import com.opencsv.CSVReader;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
@@ -31,15 +32,18 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.threeten.bp.LocalDateTime;
-import org.threeten.bp.ZoneId;
-import org.threeten.bp.ZonedDateTime;
-import org.threeten.bp.format.DateTimeFormatter;
-import org.threeten.bp.temporal.TemporalAccessor;
+
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.TemporalAccessor;
public class CsvReader implements RecordReader {
private long rowNumber = 0;
@@ -238,6 +242,29 @@ public class CsvReader implements RecordReader {
}
}
+ class DateColumnConverter extends ConverterImpl {
+ DateColumnConverter(IntWritable offset) { super(offset); }
+
+ @Override
+ public void convert(String[] values, ColumnVector column, int row) {
+ if (values[offset] == null || nullString.equals(values[offset])) {
+ column.noNulls = false;
+ column.isNull[row] = true;
+ } else {
+ DateColumnVector vector = (DateColumnVector) column;
+
+ final LocalDate dt = LocalDate.parse(values[offset]);
+
+ if (dt != null) {
+ vector.vector[row] = dt.toEpochDay();
+ } else {
+ column.noNulls = false;
+ column.isNull[row] = true;
+ }
+ }
+ }
+ }
+
class TimestampConverter extends ConverterImpl {
TimestampConverter(IntWritable offset) {
super(offset);
@@ -252,16 +279,18 @@ public class CsvReader implements RecordReader {
TimestampColumnVector vector = (TimestampColumnVector) column;
TemporalAccessor temporalAccessor =
dateTimeFormatter.parseBest(values[offset],
- ZonedDateTime.FROM, LocalDateTime.FROM);
+ ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from);
if (temporalAccessor instanceof ZonedDateTime) {
ZonedDateTime zonedDateTime = ((ZonedDateTime) temporalAccessor);
- Timestamp timestamp = new Timestamp(zonedDateTime.toEpochSecond() * 1000L);
- timestamp.setNanos(zonedDateTime.getNano());
+ Timestamp timestamp = Timestamp.from(zonedDateTime.toInstant());
+ vector.set(row, timestamp);
+ } else if (temporalAccessor instanceof OffsetDateTime) {
+ OffsetDateTime offsetDateTime = (OffsetDateTime) temporalAccessor;
+ Timestamp timestamp = Timestamp.from(offsetDateTime.toInstant());
vector.set(row, timestamp);
} else if (temporalAccessor instanceof LocalDateTime) {
ZonedDateTime tz = ((LocalDateTime) temporalAccessor).atZone(ZoneId.systemDefault());
- Timestamp timestamp = new Timestamp(tz.toEpochSecond() * 1000L);
- timestamp.setNanos(tz.getNano());
+ Timestamp timestamp = Timestamp.from(tz.toInstant());
vector.set(row, timestamp);
} else {
column.noNulls = false;
@@ -318,6 +347,8 @@ public class CsvReader implements RecordReader {
case CHAR:
case VARCHAR:
return new BytesConverter(startOffset);
+ case DATE:
+ return new DateColumnConverter(startOffset);
case TIMESTAMP:
case TIMESTAMP_INSTANT:
return new TimestampConverter(startOffset);
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
index d8213e3..0118c20 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
@@ -36,16 +37,18 @@ import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
-import org.threeten.bp.LocalDateTime;
-import org.threeten.bp.ZonedDateTime;
-import org.threeten.bp.ZoneId;
-import org.threeten.bp.format.DateTimeFormatter;
-import org.threeten.bp.temporal.TemporalAccessor;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.TemporalAccessor;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -135,6 +138,26 @@ public class JsonReader implements RecordReader {
}
}
+ static class DateColumnConverter implements JsonConverter {
+ public void convert(JsonElement value, ColumnVector vect, int row) {
+ if (value == null || value.isJsonNull()) {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ } else {
+ DateColumnVector vector = (DateColumnVector) vect;
+
+ final LocalDate dt = LocalDate.parse(value.getAsString());
+
+ if (dt != null) {
+ vector.vector[row] = dt.toEpochDay();
+ } else {
+ vect.noNulls = false;
+ vect.isNull[row] = true;
+ }
+ }
+ }
+ }
+
class TimestampColumnConverter implements JsonConverter {
@Override
public void convert(JsonElement value, ColumnVector vect, int row) {
@@ -144,16 +167,18 @@ public class JsonReader implements RecordReader {
} else {
TimestampColumnVector vector = (TimestampColumnVector) vect;
TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(value.getAsString(),
- ZonedDateTime.FROM, LocalDateTime.FROM);
+ ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from);
if (temporalAccessor instanceof ZonedDateTime) {
ZonedDateTime zonedDateTime = ((ZonedDateTime) temporalAccessor);
- Timestamp timestamp = new Timestamp(zonedDateTime.toEpochSecond() * 1000L);
- timestamp.setNanos(zonedDateTime.getNano());
+ Timestamp timestamp = Timestamp.from(zonedDateTime.toInstant());
+ vector.set(row, timestamp);
+ } else if (temporalAccessor instanceof OffsetDateTime) {
+ OffsetDateTime offsetDateTime = (OffsetDateTime) temporalAccessor;
+ Timestamp timestamp = Timestamp.from(offsetDateTime.toInstant());
vector.set(row, timestamp);
} else if (temporalAccessor instanceof LocalDateTime) {
ZonedDateTime tz = ((LocalDateTime) temporalAccessor).atZone(ZoneId.systemDefault());
- Timestamp timestamp = new Timestamp(tz.toEpochSecond() * 1000L);
- timestamp.setNanos(tz.getNano());
+ Timestamp timestamp = Timestamp.from(tz.toInstant());
vector.set(row, timestamp);
} else {
vect.noNulls = false;
@@ -283,6 +308,8 @@ public class JsonReader implements RecordReader {
return new StringColumnConverter();
case DECIMAL:
return new DecimalColumnConverter();
+ case DATE:
+ return new DateColumnConverter();
case TIMESTAMP:
case TIMESTAMP_INSTANT:
return new TimestampColumnConverter();
diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java b/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
index f839513..95ca355 100644
--- a/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
+++ b/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
@@ -18,14 +18,24 @@
package org.apache.orc.tools.convert;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;
import org.junit.Test;
+import java.io.IOException;
import java.io.StringReader;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
public class TestJsonReader {
@Test
@@ -72,4 +82,62 @@ public class TestJsonReader {
assertEquals("1969-12-31 23:59:58.9999", cv.asScratchTimestamp(5).toString());
}
+ @Test
+ public void testDateTypeSupport() throws IOException {
+ LocalDate date1 = LocalDate.of(2021, 1, 18);
+ LocalDate date2 = LocalDate.now();
+ String inputString = "{\"dt\": \"" + date1.toString() + "\"}\n" +
+ "{\"dt\": \"" + date2.toString() + "\"}\n" +
+ "{\"dt\": \"" + date2.toString() + "\"}\n" +
+ "{\"dt\": null}";
+
+
+ StringReader input = new StringReader(inputString);
+
+ TypeDescription schema = TypeDescription.fromString("struct<dt:date>");
+ JsonReader reader = new JsonReader(input, null, 1, schema, "");
+ VectorizedRowBatch batch = schema.createRowBatch(4);
+ assertTrue(reader.nextBatch(batch));
+ assertEquals(4, batch.size);
+ DateColumnVector cv = (DateColumnVector) batch.cols[0];
+ assertEquals(date1, LocalDate.ofEpochDay(cv.vector[0]));
+ assertEquals(date2, LocalDate.ofEpochDay(cv.vector[1]));
+ assertEquals(date2, LocalDate.ofEpochDay(cv.vector[2]));
+ assertFalse(cv.isNull[2]);
+ assertTrue(cv.isNull[3]);
+ }
+
+ @Test
+ public void testDateTimeTypeSupport() throws IOException {
+ String timestampFormat = "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[['.'][ ]][[SSSSSSSSS][SSSSSS][SSS]][[X][Z]['['VV']']]";
+ LocalDateTime datetime1 = LocalDateTime.of(2021, 1, 18, 1, 2, 3, 4);
+ LocalDateTime datetime2 = LocalDateTime.now();
+ OffsetDateTime datetime3 = OffsetDateTime.of(datetime1, ZoneOffset.UTC);
+ OffsetDateTime datetime4 = OffsetDateTime.of(datetime2, ZoneOffset.ofHours(-7));
+ ZonedDateTime datetime5 = ZonedDateTime.of(datetime1, ZoneId.of("UTC"));
+ ZonedDateTime datetime6 = ZonedDateTime.of(datetime2, ZoneId.of("America/New_York"));
+
+ String inputString = "{\"dt\": \"" + datetime1.toString() + "\"}\n" +
+ "{\"dt\": \"" + datetime2.toString() + "\"}\n" +
+ "{\"dt\": \"" + datetime3.toString() + "\"}\n" +
+ "{\"dt\": \"" + datetime4.toString().replace("07:00", "0700") + "\"}\n" +
+ "{\"dt\": \"" + datetime5.toLocalDateTime().toString() + "[" + datetime5.getZone() + "]\"}\n" +
+ "{\"dt\": \"" + datetime6.toLocalDateTime().toString() + "[" + datetime6.getZone() + "]\"}\n";
+
+ StringReader input = new StringReader(inputString);
+
+ TypeDescription schema = TypeDescription.fromString("struct<dt:timestamp>");
+ JsonReader reader = new JsonReader(input, null, 1, schema, timestampFormat);
+ VectorizedRowBatch batch = schema.createRowBatch(6);
+ assertTrue(reader.nextBatch(batch));
+ assertEquals(6, batch.size);
+ TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
+ assertEquals(datetime1, LocalDateTime.from(cv.asScratchTimestamp(0).toLocalDateTime()));
+ assertEquals(datetime2, LocalDateTime.from(cv.asScratchTimestamp(1).toLocalDateTime()));
+ assertEquals(datetime3.toInstant(), cv.asScratchTimestamp(2).toInstant());
+ assertEquals(datetime4.toInstant(), cv.asScratchTimestamp(3).toInstant());
+ assertEquals(datetime5.toInstant(), cv.asScratchTimestamp(4).toInstant());
+ assertEquals(datetime6.toInstant(), cv.asScratchTimestamp(5).toInstant());
+ }
+
}