Posted to commits@hive.apache.org by ha...@apache.org on 2020/05/03 04:07:47 UTC

[hive] branch master updated: HIVE-23345: Enable Parquet timestamp types (INT64 and INT96) conversion to Hive BIGINT type. Adding test cases (Panos G via Ashutosh Chauhan)

This is an automated email from the ASF dual-hosted git repository.

hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 659e28d  HIVE-23345: Enable Parquet timestamp types (INT64 and INT96) conversion to Hive BIGINT type. Adding test cases (Panos G via Ashutosh Chauhan)
659e28d is described below

commit 659e28de0b609d114e20e6294348abb74f49f6e0
Author: Panos Garefalakis <pg...@cloudera.com>
AuthorDate: Fri May 1 14:24:39 2020 +0100

    HIVE-23345: Enable Parquet timestamp types (INT64 and INT96) conversion to Hive BIGINT type. Adding test cases (Panos G via Ashutosh Chauhan)
    
    Change-Id: I8666a95cc7ff7495a86b960c2ea173cd875bfa4f
    Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
 .../test/resources/testconfiguration.properties    |  1 +
 .../hive/ql/io/parquet/convert/ETypeConverter.java | 40 +++++++++++++--
 .../ql/io/parquet/convert/TestETypeConverter.java  | 29 +++++++++++
 .../clientpositive/parquet_timestampt_to_bigint.q  | 25 +++++++++
 .../llap/parquet_timestampt_to_bigint.q.out        | 60 ++++++++++++++++++++++
 5 files changed, 152 insertions(+), 3 deletions(-)

diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b639718..5468728 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -695,6 +695,7 @@ minillaplocal.query.files=\
   parquet_legacy_mixed_date.q,\
   parquet_legacy_mixed_timestamp.q,\
   parquet_proleptic_mixed_date.q,\
+  parquet_timestampt_to_bigint.q,\
   partition_ctas.q,\
   partition_multilevels.q,\
   partition_shared_scan.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
index 6082321..8e436bc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
@@ -14,6 +14,8 @@
 package org.apache.hadoop.hive.ql.io.parquet.convert;
 
 import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.util.ArrayList;
 import java.util.Map;
 import java.util.Optional;
@@ -43,6 +45,7 @@ import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
+import org.apache.parquet.Preconditions;
 import org.apache.parquet.column.Dictionary;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.io.api.PrimitiveConverter;
@@ -662,9 +665,25 @@ public enum ETypeConverter {
       };
     }
   },
-  ETIMESTAMP_CONVERTER(TimestampWritableV2.class) {
+  EINT96_TIMESTAMP_CONVERTER(TimestampWritableV2.class) {
     @Override
     PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) {
+      if (hiveTypeInfo != null) {
+        String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+        switch (typeName) {
+          case serdeConstants.BIGINT_TYPE_NAME:
+            return new BinaryConverter<LongWritable>(type, parent, index) {
+              @Override
+              protected LongWritable convert(Binary binary) {
+                Preconditions.checkArgument(binary.length() == 12, "Must be 12 bytes");
+                ByteBuffer buf = binary.toByteBuffer();
+                buf.order(ByteOrder.LITTLE_ENDIAN);
+                long longVal = buf.getLong();
+                return new LongWritable(longVal);
+              }
+            };
+        }
+      }
       return new BinaryConverter<TimestampWritableV2>(type, parent, index) {
         @Override
         protected TimestampWritableV2 convert(Binary binary) {
@@ -690,6 +709,22 @@ public enum ETypeConverter {
     @Override
     PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent,
         TypeInfo hiveTypeInfo) {
+      if (hiveTypeInfo != null) {
+        String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+        switch (typeName) {
+          case serdeConstants.BIGINT_TYPE_NAME:
+            return new BinaryConverter<LongWritable>(type, parent, index) {
+              @Override
+              protected LongWritable convert(Binary binary) {
+                Preconditions.checkArgument(binary.length() == 8, "Must be 8 bytes");
+                ByteBuffer buf = binary.toByteBuffer();
+                buf.order(ByteOrder.LITTLE_ENDIAN);
+                long longVal = buf.getLong();
+                return new LongWritable(longVal);
+              }
+            };
+        }
+      }
       return new PrimitiveConverter() {
         @Override
         public void addLong(final long value) {
@@ -735,8 +770,7 @@ public enum ETypeConverter {
   public static PrimitiveConverter getNewConverter(final PrimitiveType type, final int index,
                                                    final ConverterParent parent, final TypeInfo hiveTypeInfo) {
     if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) {
-      //TODO- cleanup once parquet support Timestamp type annotation.
-      return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+      return EINT96_TIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
     }
     if (type.getLogicalTypeAnnotation() != null) {
       Optional<PrimitiveConverter> converter = type.getLogicalTypeAnnotation()
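
The byte-level reads behind the two new BIGINT branches can be sketched standalone. Below is a minimal JDK-only sketch (class and method names are illustrative, not patch code): an INT96 timestamp is 12 bytes, a little-endian nanoseconds-of-day long followed by a 4-byte Julian day, and the converter returns only the leading long; the INT64 branch reads a single little-endian long holding the epoch value.

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;

    // Illustrative stand-in for the two new BinaryConverter<LongWritable>
    // branches; the names are hypothetical, the logic mirrors the patch.
    public class TimestampBinaryToLongSketch {

      // INT96: 12 bytes = little-endian nanos-of-day (8) + Julian day (4).
      // The converter returns only the first long, i.e. the nanos-of-day;
      // the Julian-day bytes are left unread.
      static long fromInt96(byte[] bytes) {
        if (bytes.length != 12) {
          throw new IllegalArgumentException("Must be 12 bytes");
        }
        return ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getLong();
      }

      // INT64: 8 bytes = one little-endian long holding the epoch value
      // (millis, micros, or nanos, depending on the logical-type unit).
      static long fromInt64(byte[] bytes) {
        if (bytes.length != 8) {
          throw new IllegalArgumentException("Must be 8 bytes");
        }
        return ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getLong();
      }

      public static void main(String[] args) {
        ByteBuffer int96 = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
        int96.putLong(61111231000000L); // 16:58:31.231 as nanos-of-day
        int96.putInt(2451090);          // Julian day (here: 1998-10-03)
        System.out.println(fromInt96(int96.array())); // 61111231000000
      }
    }
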
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
index be4c880..74e2495 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
@@ -21,6 +21,8 @@ package org.apache.hadoop.hive.ql.io.parquet.convert;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.time.ZoneId;
 
 import org.apache.hadoop.hive.common.type.Timestamp;
@@ -109,6 +111,33 @@ public class TestETypeConverter {
   }
 
   @Test
+  public void testGetSmallBigIntConverter() {
+    Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+    long msTime = timestamp.toEpochMilli();
+    ByteBuffer buf = ByteBuffer.allocate(12);
+    buf.order(ByteOrder.LITTLE_ENDIAN);
+    buf.putLong(msTime);
+    buf.flip();
+    // The INT64 type needs a Timestamp logicalType annotation here
+    PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS);
+    Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, Binary.fromByteBuffer(buf));
+    // Retrieve as BigInt
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(msTime, longWritable.get());
+  }
+
+  @Test
+  public void testGetBigIntConverter() {
+    Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+    NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
+    PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT96).named("value");
+    Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, nanoTime.toBinary());
+    // Retrieve as BigInt
+    LongWritable longWritable = (LongWritable) writable;
+    assertEquals(nanoTime.getTimeOfDayNanos(), longWritable.get());
+  }
+
+  @Test
   public void testGetTimestampConverter() throws Exception {
     Timestamp timestamp = Timestamp.valueOf("2018-06-15 15:12:20.0");
     NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
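
The two new tests pin down an asymmetry worth noting: for an INT64 MILLIS timestamp the BIGINT read recovers the full epoch-millis value, while for INT96 it recovers only the time-of-day nanos, which is why the second test asserts against getTimeOfDayNanos() rather than an epoch value. A JDK-only mirror of testGetSmallBigIntConverter's buffer handling (a sketch, assuming Hive's zone-less Timestamp.toEpochMilli() behaves like a UTC instant; no Parquet or Hive classes involved):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.time.Instant;

    // Hypothetical stand-in for the test's round trip.
    public class Int64MillisRoundTrip {
      public static void main(String[] args) {
        long msTime = Instant.parse("1998-10-03T09:58:31.231Z").toEpochMilli();

        // Pack the way the test does: little-endian long, then flip so the
        // resulting binary is exactly 8 bytes.
        ByteBuffer buf = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
        buf.putLong(msTime);
        buf.flip();

        // Decode the way the new 8-byte BIGINT branch does.
        long decoded = buf.getLong();
        System.out.println(decoded == msTime); // true
      }
    }
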
diff --git a/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
new file mode 100644
index 0000000..5aa4ab1
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
@@ -0,0 +1,25 @@
+set hive.vectorized.execution.enabled=false;
+set parquet.column.index.access=true;
+
+-- Test Parquet table with TIMESTAMP column to BIGINT conversion
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/parquet_format_ts;
+
+DROP TABLE ts_pq;
+
+CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+    STORED AS PARQUET
+    LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231');
+
+SELECT * FROM ts_pq;
+
+-- Now read the same data through another table that maps the timestamp column to BIGINT
+
+CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+    STORED AS PARQUET
+    LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+SELECT * FROM ts_pq_2;
+
+dfs -rmr ${system:test.tmp.dir}/parquet_format_ts;
\ No newline at end of file
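
The q.out below shows 61111231000000 for the BIGINT read of ts_pq_2. That is the INT96 nanoseconds-of-day in UTC: assuming the qtest JVM runs in US/Pacific (a Hive test-infra convention, not stated in the diff), local 1998-10-03 09:58:31.231 PDT is 16:58:31.231 UTC, and the expected value works out as follows:

    import java.time.LocalTime;

    // Worked check of the expected q.out value (timezone assumption: the
    // writing JVM is UTC-7, so 09:58:31.231 local is 16:58:31.231 UTC).
    public class ExpectedBigintValue {
      public static void main(String[] args) {
        System.out.println(LocalTime.of(16, 58, 31, 231_000_000).toNanoOfDay());
        // prints 61111231000000
      }
    }
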
diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
new file mode 100644
index 0000000..63af5b8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
@@ -0,0 +1,60 @@
+PREHOOK: query: DROP TABLE ts_pq
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE ts_pq
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+    STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+    STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq
+PREHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ts_pq
+POSTHOOK: Lineage: ts_pq.ts1 SCRIPT []
+PREHOOK: query: SELECT * FROM ts_pq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+1998-10-03 09:58:31.231
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+    STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq_2
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+    STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq_2
+PREHOOK: query: SELECT * FROM ts_pq_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+61111231000000
+#### A masked pattern was here ####