You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by jc...@apache.org on 2018/06/01 15:13:55 UTC
hive git commit: Revert "HIVE-19723: Arrow serde: "Unsupported data type: Timestamp(NANOSECOND, null)" (Teddy Choi, reviewed by Matt McCline)"
Repository: hive
Updated Branches:
refs/heads/master 28779d202 -> 17d661e5d
Revert "HIVE-19723: Arrow serde: "Unsupported data type: Timestamp(NANOSECOND, null)" (Teddy Choi, reviewed by Matt McCline)"
This reverts commit 1c970d9243dba471068998e18cf0823eb43a09de.
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/17d661e5
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/17d661e5
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/17d661e5
Branch: refs/heads/master
Commit: 17d661e5db4d480f6d0d3a6e92a15b407d15fdeb
Parents: 28779d2
Author: Jesus Camacho Rodriguez <jc...@apache.org>
Authored: Fri Jun 1 08:13:42 2018 -0700
Committer: Jesus Camacho Rodriguez <jc...@apache.org>
Committed: Fri Jun 1 08:13:42 2018 -0700
----------------------------------------------------------------------
.../hadoop/hive/ql/io/arrow/Deserializer.java | 86 +++++++-------------
.../hadoop/hive/ql/io/arrow/Serializer.java | 20 +++--
.../io/arrow/TestArrowColumnarBatchSerDe.java | 29 +++----
3 files changed, 56 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/17d661e5/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
index 3a79e5f..fb5800b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Deserializer.java
@@ -29,14 +29,12 @@ import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.IntervalDayVector;
import org.apache.arrow.vector.IntervalYearVector;
import org.apache.arrow.vector.SmallIntVector;
-import org.apache.arrow.vector.TimeStampMilliVector;
import org.apache.arrow.vector.TimeStampNanoVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.holders.NullableIntervalDayHolder;
-import org.apache.arrow.vector.types.Types;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
@@ -132,7 +130,7 @@ class Deserializer {
private void read(FieldVector arrowVector, ColumnVector hiveVector, TypeInfo typeInfo) {
switch (typeInfo.getCategory()) {
case PRIMITIVE:
- readPrimitive(arrowVector, hiveVector);
+ readPrimitive(arrowVector, hiveVector, typeInfo);
break;
case LIST:
readList(arrowVector, (ListColumnVector) hiveVector, (ListTypeInfo) typeInfo);
@@ -151,14 +149,15 @@ class Deserializer {
}
}
- private void readPrimitive(FieldVector arrowVector, ColumnVector hiveVector) {
- final Types.MinorType minorType = arrowVector.getMinorType();
+ private void readPrimitive(FieldVector arrowVector, ColumnVector hiveVector, TypeInfo typeInfo) {
+ final PrimitiveObjectInspector.PrimitiveCategory primitiveCategory =
+ ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
final int size = arrowVector.getValueCount();
hiveVector.ensureSize(size, false);
- switch (minorType) {
- case BIT:
+ switch (primitiveCategory) {
+ case BOOLEAN:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -170,7 +169,7 @@ class Deserializer {
}
}
break;
- case TINYINT:
+ case BYTE:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -182,7 +181,7 @@ class Deserializer {
}
}
break;
- case SMALLINT:
+ case SHORT:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -206,7 +205,7 @@ class Deserializer {
}
}
break;
- case BIGINT:
+ case LONG:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -218,19 +217,19 @@ class Deserializer {
}
}
break;
- case FLOAT4:
- {
- for (int i = 0; i < size; i++) {
- if (arrowVector.isNull(i)) {
- VectorizedBatchUtil.setNullColIsNullValue(hiveVector, i);
- } else {
- hiveVector.isNull[i] = false;
- ((DoubleColumnVector) hiveVector).vector[i] = ((Float4Vector) arrowVector).get(i);
- }
+ case FLOAT:
+ {
+ for (int i = 0; i < size; i++) {
+ if (arrowVector.isNull(i)) {
+ VectorizedBatchUtil.setNullColIsNullValue(hiveVector, i);
+ } else {
+ hiveVector.isNull[i] = false;
+ ((DoubleColumnVector) hiveVector).vector[i] = ((Float4Vector) arrowVector).get(i);
}
}
+ }
break;
- case FLOAT8:
+ case DOUBLE:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -242,7 +241,9 @@ class Deserializer {
}
}
break;
+ case STRING:
case VARCHAR:
+ case CHAR:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -254,7 +255,7 @@ class Deserializer {
}
}
break;
- case DATEDAY:
+ case DATE:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -266,37 +267,7 @@ class Deserializer {
}
}
break;
- case TIMESTAMPMILLI:
- {
- for (int i = 0; i < size; i++) {
- if (arrowVector.isNull(i)) {
- VectorizedBatchUtil.setNullColIsNullValue(hiveVector, i);
- } else {
- hiveVector.isNull[i] = false;
-
- // Time = second + sub-second
- final long timeInMillis = ((TimeStampMilliVector) arrowVector).get(i);
- final TimestampColumnVector timestampColumnVector = (TimestampColumnVector) hiveVector;
- int subSecondInNanos = (int) ((timeInMillis % MS_PER_SECOND) * NS_PER_MS);
- long second = timeInMillis / MS_PER_SECOND;
-
- // A nanosecond value should not be negative
- if (subSecondInNanos < 0) {
-
- // So add one second to the negative nanosecond value to make it positive
- subSecondInNanos += NS_PER_SECOND;
-
- // Subtract one second from the second value because we added one second,
- // then subtract one more second because of the ceiling in the division.
- second -= 2;
- }
- timestampColumnVector.time[i] = second * MS_PER_SECOND;
- timestampColumnVector.nanos[i] = subSecondInNanos;
- }
- }
- }
- break;
- case TIMESTAMPNANO:
+ case TIMESTAMP:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -326,7 +297,7 @@ class Deserializer {
}
}
break;
- case VARBINARY:
+ case BINARY:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -351,7 +322,7 @@ class Deserializer {
}
}
break;
- case INTERVALYEAR:
+ case INTERVAL_YEAR_MONTH:
{
for (int i = 0; i < size; i++) {
if (arrowVector.isNull(i)) {
@@ -363,7 +334,7 @@ class Deserializer {
}
}
break;
- case INTERVALDAY:
+ case INTERVAL_DAY_TIME:
{
final IntervalDayVector intervalDayVector = (IntervalDayVector) arrowVector;
final NullableIntervalDayHolder intervalDayHolder = new NullableIntervalDayHolder();
@@ -383,8 +354,11 @@ class Deserializer {
}
}
break;
+ case VOID:
+ case TIMESTAMPLOCALTZ:
+ case UNKNOWN:
default:
- throw new IllegalArgumentException();
+ break;
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/17d661e5/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java
index 6c16808..bd23011 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/arrow/Serializer.java
@@ -30,7 +30,7 @@ import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.IntervalDayVector;
import org.apache.arrow.vector.IntervalYearVector;
import org.apache.arrow.vector.SmallIntVector;
-import org.apache.arrow.vector.TimeStampMilliVector;
+import org.apache.arrow.vector.TimeStampNanoVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
@@ -175,7 +175,7 @@ class Serializer {
case DATE:
return Types.MinorType.DATEDAY.getType();
case TIMESTAMP:
- return Types.MinorType.TIMESTAMPMILLI.getType();
+ return Types.MinorType.TIMESTAMPNANO.getType();
case BINARY:
return Types.MinorType.VARBINARY.getType();
case DECIMAL:
@@ -430,13 +430,23 @@ class Serializer {
break;
case TIMESTAMP:
{
- final TimeStampMilliVector timeStampMilliVector = (TimeStampMilliVector) arrowVector;
+ final TimeStampNanoVector timeStampNanoVector = (TimeStampNanoVector) arrowVector;
final TimestampColumnVector timestampColumnVector = (TimestampColumnVector) hiveVector;
for (int i = 0; i < size; i++) {
if (hiveVector.isNull[i]) {
- timeStampMilliVector.setNull(i);
+ timeStampNanoVector.setNull(i);
} else {
- timeStampMilliVector.set(i, timestampColumnVector.getTime(i));
+ // Time = second + sub-second
+ final long secondInMillis = timestampColumnVector.getTime(i);
+ final long secondInNanos = (secondInMillis - secondInMillis % 1000) * NS_PER_MS; // second
+ final long subSecondInNanos = timestampColumnVector.getNanos(i); // sub-second
+
+ if ((secondInMillis > 0 && secondInNanos < 0) || (secondInMillis < 0 && secondInNanos > 0)) {
+ // If the timestamp cannot be represented in long nanosecond, set it as a null value
+ timeStampNanoVector.setNull(i);
+ } else {
+ timeStampNanoVector.set(i, secondInNanos + subSecondInNanos);
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/hive/blob/17d661e5/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
index 4b430eb..74f6624 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/arrow/TestArrowColumnarBatchSerDe.java
@@ -26,7 +26,6 @@ import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.common.type.HiveVarchar;
-import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
@@ -109,10 +108,14 @@ public class TestArrowColumnarBatchSerDe {
private final static long NEGATIVE_TIME_IN_MS = TimeUnit.DAYS.toMillis(-9 * 365 + 31 + 3);
private final static Timestamp TIMESTAMP;
private final static Timestamp NEGATIVE_TIMESTAMP_WITHOUT_NANOS;
+ private final static Timestamp NEGATIVE_TIMESTAMP_WITH_NANOS;
static {
TIMESTAMP = new Timestamp(TIME_IN_MS);
+ TIMESTAMP.setNanos(123456789);
NEGATIVE_TIMESTAMP_WITHOUT_NANOS = new Timestamp(NEGATIVE_TIME_IN_MS);
+ NEGATIVE_TIMESTAMP_WITH_NANOS = new Timestamp(NEGATIVE_TIME_IN_MS);
+ NEGATIVE_TIMESTAMP_WITH_NANOS.setNanos(123456789);
}
private final static Object[][] DTI_ROWS = {
@@ -128,6 +131,12 @@ public class TestArrowColumnarBatchSerDe {
null,
null
},
+ {
+ null,
+ new TimestampWritable(NEGATIVE_TIMESTAMP_WITH_NANOS),
+ null,
+ null
+ },
{null, null, null, null},
};
@@ -498,23 +507,6 @@ public class TestArrowColumnarBatchSerDe {
}
@Test
- public void testPrimitiveRandomTimestamp() throws SerDeException {
- String[][] schema = {
- {"timestamp1", "timestamp"},
- };
-
- int size = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_ARROW_BATCH_SIZE);
- Random rand = new Random(294722773L);
- Object[][] rows = new Object[size][];
- for (int i = 0; i < size; i++) {
- long millis = ((long) rand.nextInt(Integer.MAX_VALUE)) * 1000;
- rows[i] = new Object[] {new TimestampWritable(new Timestamp(millis))};
- }
-
- initAndSerializeAndDeserialize(schema, rows);
- }
-
- @Test
public void testPrimitiveDecimal() throws SerDeException {
String[][] schema = {
{"decimal1", "decimal(38,10)"},
@@ -781,4 +773,5 @@ public class TestArrowColumnarBatchSerDe {
initAndSerializeAndDeserialize(schema, toList(DECIMAL_ROWS));
}
+
}