You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by pg...@apache.org on 2021/01/20 11:08:12 UTC

[orc] branch master updated: ORC-738: Add date type conversion support in `Java Tools` (#631)

This is an automated email from the ASF dual-hosted git repository.

pgaref pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/master by this push:
     new e81e572  ORC-738: Add date type conversion support in `Java Tools` (#631)
e81e572 is described below

commit e81e572d26ecceb6ae4626d16865b98cd3a5b1c5
Author: darkamgine <da...@gmail.com>
AuthorDate: Wed Jan 20 19:08:03 2021 +0800

    ORC-738: Add date type conversion support in `Java Tools` (#631)
    
    ### What changes were proposed in this pull request?
    * CSV and Json converters can now take in columns with date format instead of throwing an error on invalid type
    * Replaced the threetenbp date/time library with the standard Java version
    * Datetimes support OffsetDateTime formats, e.g. 2021-01-18T12:34:56.123+04:00
    * Create timestamps from converting to Instant rather than manually setting seconds and nanos
    
    ### Why are the changes needed?
    Date is a common datatype and should be supported instead of relying on a hack with timestamps at 00:00:00
    The threetenbp library is outdated and unnecessary for Java 8+
    Datetime formats should allow for OffsetDateTime as well as ZonedDateTime and LocalDateTime formats for full ISO 8601 compatibility
    
    ### How was this patch tested?
    TestJsonReader.testDateTypeSupport
    TestJsonReader.testDateTimeTypeSupport
---
 java/tools/pom.xml                                 |  4 --
 .../org/apache/orc/tools/convert/CsvReader.java    | 51 ++++++++++++----
 .../org/apache/orc/tools/convert/JsonReader.java   | 47 +++++++++++----
 .../apache/orc/tools/convert/TestJsonReader.java   | 68 ++++++++++++++++++++++
 4 files changed, 146 insertions(+), 24 deletions(-)

diff --git a/java/tools/pom.xml b/java/tools/pom.xml
index 4d40fcf..0de5696 100644
--- a/java/tools/pom.xml
+++ b/java/tools/pom.xml
@@ -78,10 +78,6 @@
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
     </dependency>
-    <dependency>
-      <groupId>org.threeten</groupId>
-      <artifactId>threetenbp</artifactId>
-    </dependency>
     <!-- orc-tools uber jar needs to include this -->
     <dependency>
       <groupId>com.google.guava</groupId>
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
index 3967a49..14e3cc9 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/CsvReader.java
@@ -21,6 +21,7 @@ import com.opencsv.CSVReader;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
@@ -31,15 +32,18 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.threeten.bp.LocalDateTime;
-import org.threeten.bp.ZoneId;
-import org.threeten.bp.ZonedDateTime;
-import org.threeten.bp.format.DateTimeFormatter;
-import org.threeten.bp.temporal.TemporalAccessor;
+
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.sql.Timestamp;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.TemporalAccessor;
 
 public class CsvReader implements RecordReader {
   private long rowNumber = 0;
@@ -238,6 +242,29 @@ public class CsvReader implements RecordReader {
     }
   }
 
+  class DateColumnConverter extends ConverterImpl {
+    DateColumnConverter(IntWritable offset) { super(offset); }
+
+    @Override
+    public void convert(String[] values, ColumnVector column, int row) {
+      if (values[offset] == null || nullString.equals(values[offset])) {
+        column.noNulls = false;
+        column.isNull[row] = true;
+      } else {
+        DateColumnVector vector = (DateColumnVector) column;
+
+        final LocalDate dt = LocalDate.parse(values[offset]);
+
+        if (dt != null) {
+          vector.vector[row] = dt.toEpochDay();
+        } else {
+          column.noNulls = false;
+          column.isNull[row] = true;
+        }
+      }
+    }
+  }
+
   class TimestampConverter extends ConverterImpl {
     TimestampConverter(IntWritable offset) {
       super(offset);
@@ -252,16 +279,18 @@ public class CsvReader implements RecordReader {
         TimestampColumnVector vector = (TimestampColumnVector) column;
         TemporalAccessor temporalAccessor =
             dateTimeFormatter.parseBest(values[offset],
-                ZonedDateTime.FROM, LocalDateTime.FROM);
+                ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from);
         if (temporalAccessor instanceof ZonedDateTime) {
           ZonedDateTime zonedDateTime = ((ZonedDateTime) temporalAccessor);
-          Timestamp timestamp = new Timestamp(zonedDateTime.toEpochSecond() * 1000L);
-          timestamp.setNanos(zonedDateTime.getNano());
+          Timestamp timestamp = Timestamp.from(zonedDateTime.toInstant());
+          vector.set(row, timestamp);
+        } else if (temporalAccessor instanceof OffsetDateTime) {
+          OffsetDateTime offsetDateTime = (OffsetDateTime) temporalAccessor;
+          Timestamp timestamp = Timestamp.from(offsetDateTime.toInstant());
           vector.set(row, timestamp);
         } else if (temporalAccessor instanceof LocalDateTime) {
           ZonedDateTime tz = ((LocalDateTime) temporalAccessor).atZone(ZoneId.systemDefault());
-          Timestamp timestamp = new Timestamp(tz.toEpochSecond() * 1000L);
-          timestamp.setNanos(tz.getNano());
+          Timestamp timestamp = Timestamp.from(tz.toInstant());
           vector.set(row, timestamp);
         } else {
           column.noNulls = false;
@@ -318,6 +347,8 @@ public class CsvReader implements RecordReader {
       case CHAR:
       case VARCHAR:
         return new BytesConverter(startOffset);
+      case DATE:
+        return new DateColumnConverter(startOffset);
       case TIMESTAMP:
       case TIMESTAMP_INSTANT:
         return new TimestampConverter(startOffset);
diff --git a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
index d8213e3..0118c20 100644
--- a/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
+++ b/java/tools/src/java/org/apache/orc/tools/convert/JsonReader.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
@@ -36,16 +37,18 @@ import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
-import org.threeten.bp.LocalDateTime;
-import org.threeten.bp.ZonedDateTime;
-import org.threeten.bp.ZoneId;
-import org.threeten.bp.format.DateTimeFormatter;
-import org.threeten.bp.temporal.TemporalAccessor;
 
 import java.io.IOException;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.sql.Timestamp;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.temporal.TemporalAccessor;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -135,6 +138,26 @@ public class JsonReader implements RecordReader {
     }
   }
 
+  static class DateColumnConverter implements JsonConverter {
+    public void convert(JsonElement value, ColumnVector vect, int row) {
+      if (value == null || value.isJsonNull()) {
+        vect.noNulls = false;
+        vect.isNull[row] = true;
+      } else {
+        DateColumnVector vector = (DateColumnVector) vect;
+
+        final LocalDate dt = LocalDate.parse(value.getAsString());
+
+        if (dt != null) {
+          vector.vector[row] = dt.toEpochDay();
+        } else {
+          vect.noNulls = false;
+          vect.isNull[row] = true;
+        }
+      }
+    }
+  }
+
   class TimestampColumnConverter implements JsonConverter {
     @Override
     public void convert(JsonElement value, ColumnVector vect, int row) {
@@ -144,16 +167,18 @@ public class JsonReader implements RecordReader {
       } else {
         TimestampColumnVector vector = (TimestampColumnVector) vect;
         TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(value.getAsString(),
-          ZonedDateTime.FROM, LocalDateTime.FROM);
+          ZonedDateTime::from, OffsetDateTime::from, LocalDateTime::from);
         if (temporalAccessor instanceof ZonedDateTime) {
           ZonedDateTime zonedDateTime = ((ZonedDateTime) temporalAccessor);
-          Timestamp timestamp = new Timestamp(zonedDateTime.toEpochSecond() * 1000L);
-          timestamp.setNanos(zonedDateTime.getNano());
+          Timestamp timestamp = Timestamp.from(zonedDateTime.toInstant());
+          vector.set(row, timestamp);
+        } else if (temporalAccessor instanceof OffsetDateTime) {
+          OffsetDateTime offsetDateTime = (OffsetDateTime) temporalAccessor;
+          Timestamp timestamp = Timestamp.from(offsetDateTime.toInstant());
           vector.set(row, timestamp);
         } else if (temporalAccessor instanceof LocalDateTime) {
           ZonedDateTime tz = ((LocalDateTime) temporalAccessor).atZone(ZoneId.systemDefault());
-          Timestamp timestamp = new Timestamp(tz.toEpochSecond() * 1000L);
-          timestamp.setNanos(tz.getNano());
+          Timestamp timestamp = Timestamp.from(tz.toInstant());
           vector.set(row, timestamp);
         } else {
           vect.noNulls = false;
@@ -283,6 +308,8 @@ public class JsonReader implements RecordReader {
         return new StringColumnConverter();
       case DECIMAL:
         return new DecimalColumnConverter();
+      case DATE:
+        return new DateColumnConverter();
       case TIMESTAMP:
       case TIMESTAMP_INSTANT:
         return new TimestampColumnConverter();
diff --git a/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java b/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
index f839513..95ca355 100644
--- a/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
+++ b/java/tools/src/test/org/apache/orc/tools/convert/TestJsonReader.java
@@ -18,14 +18,24 @@
 
 package org.apache.orc.tools.convert;
 
+import org.apache.hadoop.hive.ql.exec.vector.DateColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.orc.TypeDescription;
 import org.junit.Test;
 
+import java.io.IOException;
 import java.io.StringReader;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZonedDateTime;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
 
 public class TestJsonReader {
     @Test
@@ -72,4 +82,62 @@ public class TestJsonReader {
         assertEquals("1969-12-31 23:59:58.9999", cv.asScratchTimestamp(5).toString());
     }
 
+    @Test
+    public void testDateTypeSupport() throws IOException {
+        LocalDate date1 = LocalDate.of(2021, 1, 18);
+        LocalDate date2 = LocalDate.now();
+        String inputString = "{\"dt\": \"" + date1.toString() + "\"}\n" +
+                             "{\"dt\": \"" + date2.toString() + "\"}\n" +
+                             "{\"dt\": \"" + date2.toString() + "\"}\n" +
+                             "{\"dt\": null}";
+
+
+        StringReader input = new StringReader(inputString);
+
+        TypeDescription schema = TypeDescription.fromString("struct<dt:date>");
+        JsonReader reader = new JsonReader(input, null, 1, schema, "");
+        VectorizedRowBatch batch = schema.createRowBatch(4);
+        assertTrue(reader.nextBatch(batch));
+        assertEquals(4, batch.size);
+        DateColumnVector cv = (DateColumnVector) batch.cols[0];
+        assertEquals(date1, LocalDate.ofEpochDay(cv.vector[0]));
+        assertEquals(date2, LocalDate.ofEpochDay(cv.vector[1]));
+        assertEquals(date2, LocalDate.ofEpochDay(cv.vector[2]));
+        assertFalse(cv.isNull[2]);
+        assertTrue(cv.isNull[3]);
+    }
+
+    @Test
+    public void testDateTimeTypeSupport() throws IOException {
+        String timestampFormat = "yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[['.'][ ]][[SSSSSSSSS][SSSSSS][SSS]][[X][Z]['['VV']']]";
+        LocalDateTime datetime1 = LocalDateTime.of(2021, 1, 18, 1, 2, 3, 4);
+        LocalDateTime datetime2 = LocalDateTime.now();
+        OffsetDateTime datetime3 = OffsetDateTime.of(datetime1, ZoneOffset.UTC);
+        OffsetDateTime datetime4 = OffsetDateTime.of(datetime2, ZoneOffset.ofHours(-7));
+        ZonedDateTime datetime5 = ZonedDateTime.of(datetime1, ZoneId.of("UTC"));
+        ZonedDateTime datetime6 = ZonedDateTime.of(datetime2, ZoneId.of("America/New_York"));
+
+        String inputString = "{\"dt\": \"" + datetime1.toString() + "\"}\n" +
+                             "{\"dt\": \"" + datetime2.toString() + "\"}\n" +
+                             "{\"dt\": \"" + datetime3.toString() + "\"}\n" +
+                             "{\"dt\": \"" + datetime4.toString().replace("07:00", "0700") + "\"}\n" +
+                             "{\"dt\": \"" + datetime5.toLocalDateTime().toString() + "[" + datetime5.getZone() + "]\"}\n" +
+                             "{\"dt\": \"" + datetime6.toLocalDateTime().toString() + "[" + datetime6.getZone() + "]\"}\n";
+
+        StringReader input = new StringReader(inputString);
+
+        TypeDescription schema = TypeDescription.fromString("struct<dt:timestamp>");
+        JsonReader reader = new JsonReader(input, null, 1, schema, timestampFormat);
+        VectorizedRowBatch batch = schema.createRowBatch(6);
+        assertTrue(reader.nextBatch(batch));
+        assertEquals(6, batch.size);
+        TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
+        assertEquals(datetime1, LocalDateTime.from(cv.asScratchTimestamp(0).toLocalDateTime()));
+        assertEquals(datetime2, LocalDateTime.from(cv.asScratchTimestamp(1).toLocalDateTime()));
+        assertEquals(datetime3.toInstant(), cv.asScratchTimestamp(2).toInstant());
+        assertEquals(datetime4.toInstant(), cv.asScratchTimestamp(3).toInstant());
+        assertEquals(datetime5.toInstant(), cv.asScratchTimestamp(4).toInstant());
+        assertEquals(datetime6.toInstant(), cv.asScratchTimestamp(5).toInstant());
+    }
+
 }