You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2020/05/03 04:07:47 UTC
[hive] branch master updated: HIVE-23345: Enable Parquet timestamps
types (INT64 and INT96) conversion to Hive BIGINT type Adding test cases
(Panos G via Ashutosh Chauhan)
This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 659e28d HIVE-23345: Enable Parquet timestamps types (INT64 and INT96) conversion to Hive BIGINT type Adding test cases (Panos G via Ashutosh Chauhan)
659e28d is described below
commit 659e28de0b609d114e20e6294348abb74f49f6e0
Author: Panos Garefalakis <pg...@cloudera.com>
AuthorDate: Fri May 1 14:24:39 2020 +0100
HIVE-23345: Enable Parquet timestamps types (INT64 and INT96) conversion to Hive BIGINT type Adding test cases (Panos G via Ashutosh Chauhan)
Change-Id: I8666a95cc7ff7495a86b960c2ea173cd875bfa4f
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
---
.../test/resources/testconfiguration.properties | 1 +
.../hive/ql/io/parquet/convert/ETypeConverter.java | 40 +++++++++++++--
.../ql/io/parquet/convert/TestETypeConverter.java | 29 +++++++++++
.../clientpositive/parquet_timestampt_to_bigint.q | 25 +++++++++
.../llap/parquet_timestampt_to_bigint.q.out | 60 ++++++++++++++++++++++
5 files changed, 152 insertions(+), 3 deletions(-)
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index b639718..5468728 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -695,6 +695,7 @@ minillaplocal.query.files=\
parquet_legacy_mixed_date.q,\
parquet_legacy_mixed_timestamp.q,\
parquet_proleptic_mixed_date.q,\
+ parquet_timestampt_to_bigint.q,\
partition_ctas.q,\
partition_multilevels.q,\
partition_shared_scan.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
index 6082321..8e436bc 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/convert/ETypeConverter.java
@@ -14,6 +14,8 @@
package org.apache.hadoop.hive.ql.io.parquet.convert;
import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.Map;
import java.util.Optional;
@@ -43,6 +45,7 @@ import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.parquet.Preconditions;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.PrimitiveConverter;
@@ -662,9 +665,25 @@ public enum ETypeConverter {
};
}
},
- ETIMESTAMP_CONVERTER(TimestampWritableV2.class) {
+ EINT96_TIMESTAMP_CONVERTER(TimestampWritableV2.class) {
@Override
PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent, TypeInfo hiveTypeInfo) {
+ if (hiveTypeInfo != null) {
+ String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+ switch (typeName) {
+ case serdeConstants.BIGINT_TYPE_NAME:
+ return new BinaryConverter<LongWritable>(type, parent, index) {
+ @Override
+ protected LongWritable convert(Binary binary) {
+ Preconditions.checkArgument(binary.length() == 12, "Must be 12 bytes");
+ ByteBuffer buf = binary.toByteBuffer();
+ buf.order(ByteOrder.LITTLE_ENDIAN);
+ long longVal = buf.getLong();
+ return new LongWritable(longVal);
+ }
+ };
+ }
+ }
return new BinaryConverter<TimestampWritableV2>(type, parent, index) {
@Override
protected TimestampWritableV2 convert(Binary binary) {
@@ -690,6 +709,22 @@ public enum ETypeConverter {
@Override
PrimitiveConverter getConverter(final PrimitiveType type, final int index, final ConverterParent parent,
TypeInfo hiveTypeInfo) {
+ if (hiveTypeInfo != null) {
+ String typeName = TypeInfoUtils.getBaseName(hiveTypeInfo.getTypeName());
+ switch (typeName) {
+ case serdeConstants.BIGINT_TYPE_NAME:
+ return new BinaryConverter<LongWritable>(type, parent, index) {
+ @Override
+ protected LongWritable convert(Binary binary) {
+ Preconditions.checkArgument(binary.length() == 8, "Must be 8 bytes");
+ ByteBuffer buf = binary.toByteBuffer();
+ buf.order(ByteOrder.LITTLE_ENDIAN);
+ long longVal = buf.getLong();
+ return new LongWritable(longVal);
+ }
+ };
+ }
+ }
return new PrimitiveConverter() {
@Override
public void addLong(final long value) {
@@ -735,8 +770,7 @@ public enum ETypeConverter {
public static PrimitiveConverter getNewConverter(final PrimitiveType type, final int index,
final ConverterParent parent, final TypeInfo hiveTypeInfo) {
if (type.isPrimitive() && (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96))) {
- //TODO- cleanup once parquet support Timestamp type annotation.
- return ETypeConverter.ETIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
+ return EINT96_TIMESTAMP_CONVERTER.getConverter(type, index, parent, hiveTypeInfo);
}
if (type.getLogicalTypeAnnotation() != null) {
Optional<PrimitiveConverter> converter = type.getLogicalTypeAnnotation()
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
index be4c880..74e2495 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/convert/TestETypeConverter.java
@@ -21,6 +21,8 @@ package org.apache.hadoop.hive.ql.io.parquet.convert;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
import java.time.ZoneId;
import org.apache.hadoop.hive.common.type.Timestamp;
@@ -109,6 +111,33 @@ public class TestETypeConverter {
}
@Test
+ public void testGetSmallBigIntConverter() {
+ Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+ long msTime = timestamp.toEpochMilli();
+ ByteBuffer buf = ByteBuffer.allocate(12);
+ buf.order(ByteOrder.LITTLE_ENDIAN);
+ buf.putLong(msTime);
+ buf.flip();
+ // Need TimeStamp logicalType annotation here
+ PrimitiveType primitiveType = createInt64TimestampType(false, TimeUnit.MILLIS);
+ Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, Binary.fromByteBuffer(buf));
+ // Retrieve as BigInt
+ LongWritable longWritable = (LongWritable) writable;
+ assertEquals(msTime, longWritable.get());
+ }
+
+ @Test
+ public void testGetBigIntConverter() {
+ Timestamp timestamp = Timestamp.valueOf("1998-10-03 09:58:31.231");
+ NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
+ PrimitiveType primitiveType = Types.optional(PrimitiveTypeName.INT96).named("value");
+ Writable writable = getWritableFromBinaryConverter(createHiveTypeInfo("bigint"), primitiveType, nanoTime.toBinary());
+ // Retrieve as BigInt
+ LongWritable longWritable = (LongWritable) writable;
+ assertEquals(nanoTime.getTimeOfDayNanos(), longWritable.get());
+ }
+
+ @Test
public void testGetTimestampConverter() throws Exception {
Timestamp timestamp = Timestamp.valueOf("2018-06-15 15:12:20.0");
NanoTime nanoTime = NanoTimeUtils.getNanoTime(timestamp, true);
diff --git a/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
new file mode 100644
index 0000000..5aa4ab1
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_timestampt_to_bigint.q
@@ -0,0 +1,25 @@
+set hive.vectorized.execution.enabled=false;
+set parquet.column.index.access=true;
+
+-- Test parquet table with Timestamp Col to BigInt conversion
+dfs ${system:test.dfs.mkdir} ${system:test.tmp.dir}/parquet_format_ts;
+
+DROP TABLE ts_pq;
+
+CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+ STORED AS PARQUET
+ LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231');
+
+SELECT * FROM ts_pq;
+
+-- Now use data from another table that uses TS as a BIGINT
+
+CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+ STORED AS PARQUET
+ LOCATION '${system:test.tmp.dir}/parquet_format_ts';
+
+SELECT * FROM ts_pq_2;
+
+dfs -rmr ${system:test.tmp.dir}/parquet_format_ts;
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
new file mode 100644
index 0000000..63af5b8
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_timestampt_to_bigint.q.out
@@ -0,0 +1,60 @@
+PREHOOK: query: DROP TABLE ts_pq
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: DROP TABLE ts_pq
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+ STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq (ts1 TIMESTAMP)
+ STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq
+PREHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ts_pq
+POSTHOOK: query: INSERT INTO ts_pq VALUES ('1998-10-03 09:58:31.231')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ts_pq
+POSTHOOK: Lineage: ts_pq.ts1 SCRIPT []
+PREHOOK: query: SELECT * FROM ts_pq
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq
+#### A masked pattern was here ####
+1998-10-03 09:58:31.231
+PREHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+ STORED AS PARQUET
+#### A masked pattern was here ####
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ts_pq_2
+POSTHOOK: query: CREATE EXTERNAL TABLE ts_pq_2 (ts2 BIGINT)
+ STORED AS PARQUET
+#### A masked pattern was here ####
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ts_pq_2
+PREHOOK: query: SELECT * FROM ts_pq_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT * FROM ts_pq_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ts_pq_2
+#### A masked pattern was here ####
+61111231000000
+#### A masked pattern was here ####