You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by mm...@apache.org on 2016/12/12 09:51:45 UTC

[4/4] hive git commit: HIVE-15338: Wrong result from non-vectorized DATEDIFF with scalar parameter of type DATE/TIMESTAMP (Matt McCline, reviewed by Jason Dere)

HIVE-15338: Wrong result from non-vectorized DATEDIFF with scalar parameter of type DATE/TIMESTAMP (Matt McCline, reviewed by Jason Dere)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6ef0b507
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6ef0b507
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6ef0b507

Branch: refs/heads/master
Commit: 6ef0b5078448f0589b85f7d6165c86961bf3eacf
Parents: 0ae2475
Author: Matt McCline <mm...@hortonworks.com>
Authored: Mon Dec 12 01:51:24 2016 -0800
Committer: Matt McCline <mm...@hortonworks.com>
Committed: Mon Dec 12 01:51:24 2016 -0800

----------------------------------------------------------------------
 .../ql/exec/vector/VectorizationContext.java    |   4 +-
 .../expressions/VectorUDFDateAddScalarCol.java  |   2 +
 .../expressions/VectorUDFDateDiffColScalar.java |   2 +
 .../expressions/VectorUDFDateDiffScalarCol.java |   2 +
 .../hive/ql/udf/generic/GenericUDFDateDiff.java |  14 +-
 .../TestVectorGenericDateExpressions.java       |   2 +-
 ql/src/test/queries/clientpositive/date_udf.q   |  15 +-
 .../clientpositive/vectorized_date_funcs.q      |  67 +-
 .../test/results/clientpositive/date_udf.q.out  |  34 +-
 .../llap/vectorized_date_funcs.q.out            | 979 +++++++++++--------
 .../results/clientpositive/spark/date_udf.q.out |  34 +-
 .../clientpositive/vectorized_date_funcs.q.out  | 979 +++++++++++--------
 12 files changed, 1244 insertions(+), 890 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index d9acdf5..d2f5408 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -1709,7 +1709,7 @@ public class VectorizationContext {
       cl = (mode == VectorExpressionDescriptor.Mode.FILTER ? FilterLongColumnInList.class : LongColumnInList.class);
       long[] inVals = new long[childrenForInList.size()];
       for (int i = 0; i != inVals.length; i++) {
-        inVals[i] = (Integer) getVectorTypeScalarValue((ExprNodeConstantDesc) childrenForInList.get(i));
+        inVals[i] = (Long) getVectorTypeScalarValue((ExprNodeConstantDesc) childrenForInList.get(i));
       }
       expr = createVectorExpression(cl, childExpr.subList(0, 1), VectorExpressionDescriptor.Mode.PROJECTION, returnType);
       ((ILongInExpr) expr).setInListValues(inVals);
@@ -2326,7 +2326,7 @@ public class VectorizationContext {
     Object scalarValue = getScalarValue(constDesc);
     switch (type) {
       case DATE:
-        return DateWritable.dateToDays((Date) scalarValue);
+        return new Long(DateWritable.dateToDays((Date) scalarValue));
       case INTERVAL_YEAR_MONTH:
         return ((HiveIntervalYearMonth) scalarValue).getTotalMonths();
       default:

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateAddScalarCol.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateAddScalarCol.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateAddScalarCol.java
index 724ea45..74ef6a6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateAddScalarCol.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateAddScalarCol.java
@@ -56,6 +56,8 @@ public class VectorUDFDateAddScalarCol extends VectorExpression {
         this.timestampValue = (Timestamp) object;
     } else if (object instanceof byte []) {
       this.stringValue = (byte[]) object;
+    } else {
+      throw new RuntimeException("Unexpected scalar object " + object.getClass().getName() + " " + object.toString());
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffColScalar.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffColScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffColScalar.java
index 71b3887..15edbdf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffColScalar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffColScalar.java
@@ -57,6 +57,8 @@ public class VectorUDFDateDiffColScalar extends VectorExpression {
       this.timestampValue = (Timestamp) object;
     } else if (object instanceof byte []) {
       this.stringValue = (byte []) object;
+    } else {
+      throw new RuntimeException("Unexpected scalar object " + object.getClass().getName() + " " + object.toString());
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffScalarCol.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffScalarCol.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffScalarCol.java
index c733bc9..371537a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffScalarCol.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDateDiffScalarCol.java
@@ -56,6 +56,8 @@ public class VectorUDFDateDiffScalarCol extends VectorExpression {
       this.timestampValue = (Timestamp) object;
     } else if (object instanceof byte []) {
       this.stringValue = (byte[]) object;
+    } else {
+      throw new RuntimeException("Unexpected scalar object " + object.getClass().getName() + " " + object.toString());
     }
   }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateDiff.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateDiff.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateDiff.java
index 403cf11..ac48f01 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateDiff.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDateDiff.java
@@ -20,7 +20,7 @@ package org.apache.hadoop.hive.ql.udf.generic;
 import java.sql.Timestamp;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
-import java.util.Date;
+import java.sql.Date;
 import java.util.TimeZone;
 
 import org.apache.hadoop.hive.ql.exec.Description;
@@ -70,7 +70,6 @@ public class GenericUDFDateDiff extends GenericUDF {
   private IntWritable result = new IntWritable();
 
   public GenericUDFDateDiff() {
-    formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
   }
 
   @Override
@@ -106,14 +105,14 @@ public class GenericUDFDateDiff extends GenericUDF {
     if (argument.get() == null) {
       return null;
     }
-    Date date = new Date();
+    Date date = new Date(0);
     switch (inputType) {
     case STRING:
     case VARCHAR:
     case CHAR:
       String dateString = converter.convert(argument.get()).toString();
       try {
-        date = formatter.parse(dateString);
+        date.setTime(formatter.parse(dateString).getTime());
       } catch (ParseException e) {
         return null;
       }
@@ -171,11 +170,8 @@ public class GenericUDFDateDiff extends GenericUDF {
     if (date == null || date2 == null) {
       return null;
     }
-    // NOTE: This implementation avoids the extra-second problem
-    // by comparing with UTC epoch and integer division.
-    // 86400 is the number of seconds in a day
-    long diffInMilliSeconds = date.getTime() - date2.getTime();
-    result.set((int) (diffInMilliSeconds / (86400 * 1000)));
+
+    result.set(DateWritable.dateToDays(date) - DateWritable.dateToDays(date2));
     return result;
   }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorGenericDateExpressions.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorGenericDateExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorGenericDateExpressions.java
index 13bfdd7..e25dcdf 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorGenericDateExpressions.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorGenericDateExpressions.java
@@ -511,7 +511,7 @@ public class TestVectorGenericDateExpressions {
     byte[] bytes = "error".getBytes(utf8);
     VectorizedRowBatch batch = new VectorizedRowBatch(2, 1);
 
-    udf = new VectorUDFDateDiffColScalar(0, 0, 1);
+    udf = new VectorUDFDateDiffColScalar(0, 0L, 1);
     udf.setInputTypes(VectorExpression.Type.TIMESTAMP, VectorExpression.Type.STRING);
     batch.cols[0] = new BytesColumnVector(1);
     batch.cols[1] = new LongColumnVector(1);

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/test/queries/clientpositive/date_udf.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/date_udf.q b/ql/src/test/queries/clientpositive/date_udf.q
index c55b9f9..3b82265 100644
--- a/ql/src/test/queries/clientpositive/date_udf.q
+++ b/ql/src/test/queries/clientpositive/date_udf.q
@@ -39,9 +39,20 @@ select unix_timestamp(d), year(d), month(d), day(d), dayofmonth(d),
 
 select date_add(d, 5), date_sub(d, 10)  from date_udf_string;
 
-select datediff(d, d), datediff(d, '2002-03-21'), datediff('2002-03-21', d),
+select
+    datediff(d, d),
+    datediff(d, '2002-03-21'),
+    datediff(d, date '2002-03-21'),
+    datediff('2002-03-21', d),
+    datediff(date '2002-03-21', d),
     datediff('2002-03-21 00:00:00', d),
-    datediff(d, '2002-03-21 00:00:00')
+    datediff(timestamp '2002-03-21 00:00:00', d),
+    datediff(d, '2002-03-21 00:00:00'),
+    datediff(d, timestamp '2002-03-21 00:00:00'),
+    datediff('2002-03-21 08:01:59', d),
+    datediff(timestamp '2002-03-21 08:01:59', d),
+    datediff(d, '2002-03-21 08:01:59'),
+    datediff(d, timestamp '2002-03-21 08:01:59')
   from date_udf_string;
 
 select 

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/test/queries/clientpositive/vectorized_date_funcs.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vectorized_date_funcs.q b/ql/src/test/queries/clientpositive/vectorized_date_funcs.q
index 7d7b1cf..899e922 100644
--- a/ql/src/test/queries/clientpositive/vectorized_date_funcs.q
+++ b/ql/src/test/queries/clientpositive/vectorized_date_funcs.q
@@ -1,6 +1,7 @@
 set hive.mapred.mode=nonstrict;
 set hive.explain.user=false;
 SET hive.vectorized.execution.enabled = true;
+set hive.cli.print.header=true;
 
 -- SORT_QUERY_RESULTS
 
@@ -25,6 +26,7 @@ INSERT INTO TABLE date_udf_flight_orc SELECT fl_date, to_utc_timestamp(fl_date,
 SELECT * FROM date_udf_flight_orc;
 
 EXPLAIN SELECT
+  fl_time,
   to_unix_timestamp(fl_time),
   year(fl_time),
   month(fl_time),
@@ -36,10 +38,18 @@ EXPLAIN SELECT
   to_date(fl_time),
   date_add(fl_time, 2),
   date_sub(fl_time, 2),
-  datediff(fl_time, "2000-01-01")
+  datediff(fl_time, "2000-01-01"),
+  datediff(fl_time, date "2000-01-01"),
+  datediff(fl_time, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_time, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_time, "2007-03-14"),
+  datediff(fl_time, date "2007-03-14"),
+  datediff(fl_time, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_time, timestamp "2007-03-14 08:21:59")
 FROM date_udf_flight_orc;
 
 SELECT
+  fl_time,
   to_unix_timestamp(fl_time),
   year(fl_time),
   month(fl_time),
@@ -51,10 +61,18 @@ SELECT
   to_date(fl_time),
   date_add(fl_time, 2),
   date_sub(fl_time, 2),
-  datediff(fl_time, "2000-01-01")
+  datediff(fl_time, "2000-01-01"),
+  datediff(fl_time, date "2000-01-01"),
+  datediff(fl_time, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_time, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_time, "2007-03-14"),
+  datediff(fl_time, date "2007-03-14"),
+  datediff(fl_time, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_time, timestamp "2007-03-14 08:21:59")
 FROM date_udf_flight_orc;
 
 EXPLAIN SELECT
+  fl_date,
   to_unix_timestamp(fl_date),
   year(fl_date),
   month(fl_date),
@@ -66,10 +84,18 @@ EXPLAIN SELECT
   to_date(fl_date),
   date_add(fl_date, 2),
   date_sub(fl_date, 2),
-  datediff(fl_date, "2000-01-01")
+  datediff(fl_date, "2000-01-01"),
+  datediff(fl_date, date "2000-01-01"),
+  datediff(fl_date, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_date, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_date, "2007-03-14"),
+  datediff(fl_date, date "2007-03-14"),
+  datediff(fl_date, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_date, timestamp "2007-03-14 08:21:59")
 FROM date_udf_flight_orc;
 
 SELECT
+  fl_date,
   to_unix_timestamp(fl_date),
   year(fl_date),
   month(fl_date),
@@ -81,10 +107,19 @@ SELECT
   to_date(fl_date),
   date_add(fl_date, 2),
   date_sub(fl_date, 2),
-  datediff(fl_date, "2000-01-01")
+  datediff(fl_date, "2000-01-01"),
+  datediff(fl_date, date "2000-01-01"),
+  datediff(fl_date, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_date, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_date, "2007-03-14"),
+  datediff(fl_date, date "2007-03-14"),
+  datediff(fl_date, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_date, timestamp "2007-03-14 08:21:59")
 FROM date_udf_flight_orc;
 
 EXPLAIN SELECT
+  fl_time,
+  fl_date,
   year(fl_time) = year(fl_date),
   month(fl_time) = month(fl_date),
   day(fl_time) = day(fl_date),
@@ -95,11 +130,22 @@ EXPLAIN SELECT
   to_date(fl_time) = to_date(fl_date),
   date_add(fl_time, 2) = date_add(fl_date, 2),
   date_sub(fl_time, 2) = date_sub(fl_date, 2),
-  datediff(fl_time, "2000-01-01") = datediff(fl_date, "2000-01-01")
+  datediff(fl_time, "2000-01-01") = datediff(fl_date, "2000-01-01"),
+  datediff(fl_time, date "2000-01-01") = datediff(fl_date, date "2000-01-01"),
+  datediff(fl_time, timestamp "2000-01-01 00:00:00") = datediff(fl_date, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_time, timestamp "2000-01-01 11:13:09") = datediff(fl_date, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_time, "2007-03-14") = datediff(fl_date, "2007-03-14"),
+  datediff(fl_time, date "2007-03-14") = datediff(fl_date, date "2007-03-14"),
+  datediff(fl_time, timestamp "2007-03-14 00:00:00") = datediff(fl_date, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_time, timestamp "2007-03-14 08:21:59") = datediff(fl_date, timestamp "2007-03-14 08:21:59"),
+  datediff(fl_date, "2000-01-01") = datediff(fl_date, date "2000-01-01"),
+  datediff(fl_date, "2007-03-14") = datediff(fl_date, date "2007-03-14")
 FROM date_udf_flight_orc;
 
 -- Should all be true or NULL
 SELECT
+  fl_time,
+  fl_date,
   year(fl_time) = year(fl_date),
   month(fl_time) = month(fl_date),
   day(fl_time) = day(fl_date),
@@ -110,7 +156,16 @@ SELECT
   to_date(fl_time) = to_date(fl_date),
   date_add(fl_time, 2) = date_add(fl_date, 2),
   date_sub(fl_time, 2) = date_sub(fl_date, 2),
-  datediff(fl_time, "2000-01-01") = datediff(fl_date, "2000-01-01")
+  datediff(fl_time, "2000-01-01") = datediff(fl_date, "2000-01-01"),
+  datediff(fl_time, date "2000-01-01") = datediff(fl_date, date "2000-01-01"),
+  datediff(fl_time, timestamp "2000-01-01 00:00:00") = datediff(fl_date, timestamp "2000-01-01 00:00:00"),
+  datediff(fl_time, timestamp "2000-01-01 11:13:09") = datediff(fl_date, timestamp "2000-01-01 11:13:09"),
+  datediff(fl_time, "2007-03-14") = datediff(fl_date, "2007-03-14"),
+  datediff(fl_time, date "2007-03-14") = datediff(fl_date, date "2007-03-14"),
+  datediff(fl_time, timestamp "2007-03-14 00:00:00") = datediff(fl_date, timestamp "2007-03-14 00:00:00"),
+  datediff(fl_time, timestamp "2007-03-14 08:21:59") = datediff(fl_date, timestamp "2007-03-14 08:21:59"),
+  datediff(fl_date, "2000-01-01") = datediff(fl_date, date "2000-01-01"),
+  datediff(fl_date, "2007-03-14") = datediff(fl_date, date "2007-03-14")
 FROM date_udf_flight_orc;
 
 EXPLAIN SELECT 

http://git-wip-us.apache.org/repos/asf/hive/blob/6ef0b507/ql/src/test/results/clientpositive/date_udf.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/date_udf.q.out b/ql/src/test/results/clientpositive/date_udf.q.out
index 9b37da6..2037367 100644
--- a/ql/src/test/results/clientpositive/date_udf.q.out
+++ b/ql/src/test/results/clientpositive/date_udf.q.out
@@ -114,7 +114,7 @@ POSTHOOK: query: select datediff(d, d), datediff(d, '2002-03-21'), datediff('200
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@date_udf
 #### A masked pattern was here ####
-0	3333	-3333	-3332	3332
+0	3333	-3333	-3333	3333
 PREHOOK: query: -- Test UDFs with string input
 select unix_timestamp(d), year(d), month(d), day(d), dayofmonth(d), 
     weekofyear(d), to_date(d)
@@ -139,21 +139,43 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@date_udf_string
 #### A masked pattern was here ####
 2011-05-11	2011-04-26
-PREHOOK: query: select datediff(d, d), datediff(d, '2002-03-21'), datediff('2002-03-21', d),
+PREHOOK: query: select
+    datediff(d, d),
+    datediff(d, '2002-03-21'),
+    datediff(d, date '2002-03-21'),
+    datediff('2002-03-21', d),
+    datediff(date '2002-03-21', d),
     datediff('2002-03-21 00:00:00', d),
-    datediff(d, '2002-03-21 00:00:00')
+    datediff(timestamp '2002-03-21 00:00:00', d),
+    datediff(d, '2002-03-21 00:00:00'),
+    datediff(d, timestamp '2002-03-21 00:00:00'),
+    datediff('2002-03-21 08:01:59', d),
+    datediff(timestamp '2002-03-21 08:01:59', d),
+    datediff(d, '2002-03-21 08:01:59'),
+    datediff(d, timestamp '2002-03-21 08:01:59')
   from date_udf_string
 PREHOOK: type: QUERY
 PREHOOK: Input: default@date_udf_string
 #### A masked pattern was here ####
-POSTHOOK: query: select datediff(d, d), datediff(d, '2002-03-21'), datediff('2002-03-21', d),
+POSTHOOK: query: select
+    datediff(d, d),
+    datediff(d, '2002-03-21'),
+    datediff(d, date '2002-03-21'),
+    datediff('2002-03-21', d),
+    datediff(date '2002-03-21', d),
     datediff('2002-03-21 00:00:00', d),
-    datediff(d, '2002-03-21 00:00:00')
+    datediff(timestamp '2002-03-21 00:00:00', d),
+    datediff(d, '2002-03-21 00:00:00'),
+    datediff(d, timestamp '2002-03-21 00:00:00'),
+    datediff('2002-03-21 08:01:59', d),
+    datediff(timestamp '2002-03-21 08:01:59', d),
+    datediff(d, '2002-03-21 08:01:59'),
+    datediff(d, timestamp '2002-03-21 08:01:59')
   from date_udf_string
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@date_udf_string
 #### A masked pattern was here ####
-0	3333	-3333	-3333	3333
+0	3333	3333	-3333	-3333	-3333	-3333	3333	3333	-3333	-3333	3333	3333
 PREHOOK: query: select 
     to_utc_timestamp(date '1970-01-01', 'America/Los_Angeles'),
     from_utc_timestamp(date '1970-01-01', 'America/Los_Angeles'),