You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sz...@apache.org on 2019/11/25 14:06:01 UTC
[hive] branch master updated: HIVE-22483: Vectorize UDF
datetime_legacy_hybrid_calendar (Karen Coppage, reviewed by Adam Szita)
This is an automated email from the ASF dual-hosted git repository.
szita pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 523d766 HIVE-22483: Vectorize UDF datetime_legacy_hybrid_calendar (Karen Coppage, reviewed by Adam Szita)
523d766 is described below
commit 523d76650c6ef23cbdee69c77f948dfd25b83104
Author: Karen Coppage <kc...@gmail.com>
AuthorDate: Mon Nov 25 14:47:16 2019 +0100
HIVE-22483: Vectorize UDF datetime_legacy_hybrid_calendar (Karen Coppage, reviewed by Adam Szita)
---
.../ql/exec/vector/expressions/FuncDateToDate.java | 159 ++++++++++++++++
.../expressions/FuncTimestampToTimestamp.java | 159 ++++++++++++++++
.../VectorUDFDatetimeLegacyHybridCalendarDate.java | 60 ++++++
...orUDFDatetimeLegacyHybridCalendarTimestamp.java | 61 ++++++
.../GenericUDFDatetimeLegacyHybridCalendar.java | 28 ++-
.../TestVectorUDFDatetimeLegacyHybridCalendar.java | 209 +++++++++++++++++++++
.../udf_datetime_legacy_hybrid_calendar.q | 29 +++
.../udf_datetime_legacy_hybrid_calendar.q.out | 128 ++++++++++++-
8 files changed, 816 insertions(+), 17 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncDateToDate.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncDateToDate.java
new file mode 100644
index 0000000..d0e68ae
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncDateToDate.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+
+import java.util.Arrays;
+
+/**
+ * This is a superclass for unary functions and expressions taking a single date and returning
+ * a date, that operate directly on the input and set the output.
+ */
+public abstract class FuncDateToDate extends VectorExpression {
+
+ private static final long serialVersionUID = 1L;
+ private final int inputColumn;
+
+ public FuncDateToDate(int inputColumn, int outputColumnNum) {
+ super(outputColumnNum);
+ this.inputColumn = inputColumn;
+ }
+
+ public FuncDateToDate() {
+ super();
+
+ // Dummy final assignments.
+ inputColumn = -1;
+ }
+
+ protected abstract void func(LongColumnVector outputColVector, LongColumnVector inputColVector, int i);
+
+ @Override
+ public void evaluate(VectorizedRowBatch batch) throws HiveException {
+
+ if (childExpressions != null) {
+ super.evaluateChildren(batch);
+ }
+
+ LongColumnVector inputColVector = (LongColumnVector) batch.cols[inputColumn];
+ LongColumnVector outputColVector = (LongColumnVector) batch.cols[outputColumnNum];
+
+ int[] sel = batch.selected;
+ int n = batch.size;
+
+ boolean[] inputIsNull = inputColVector.isNull;
+ boolean[] outputIsNull = outputColVector.isNull;
+
+ if (n == 0) {
+
+ // Nothing to do
+ return;
+ }
+
+ // We do not need to do a column reset since we are carefully changing the output.
+ outputColVector.isRepeating = false;
+
+ if (inputColVector.isRepeating) {
+ if (inputColVector.noNulls || !inputIsNull[0]) {
+ // Set isNull before call in case it changes its mind.
+ outputIsNull[0] = false;
+ func(outputColVector, inputColVector, 0);
+ } else {
+ outputIsNull[0] = true;
+ outputColVector.noNulls = false;
+ }
+ outputColVector.isRepeating = true;
+ return;
+ }
+
+ if (inputColVector.noNulls) {
+ if (batch.selectedInUse) {
+
+ // CONSIDER: For large n, fill n or all of isNull array and use the tighter ELSE loop.
+
+ if (!outputColVector.noNulls) {
+ for(int j = 0; j != n; j++) {
+ final int i = sel[j];
+ // Set isNull before call in case it changes its mind.
+ outputIsNull[i] = false;
+ func(outputColVector, inputColVector, i);
+ }
+ } else {
+ for(int j = 0; j != n; j++) {
+ final int i = sel[j];
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else {
+ if (!outputColVector.noNulls) {
+
+ // Assume it is almost always a performance win to fill all of isNull so we can
+ // safely reset noNulls.
+ Arrays.fill(outputIsNull, false);
+ outputColVector.noNulls = true;
+ }
+ for(int i = 0; i != n; i++) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else /* there are nulls in the inputColVector */ {
+
+ // Carefully handle NULLs...
+ outputColVector.noNulls = false;
+
+ if (batch.selectedInUse) {
+ for(int j = 0; j != n; j++) {
+ int i = sel[j];
+ outputColVector.isNull[i] = inputColVector.isNull[i];
+ if (!inputColVector.isNull[i]) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else {
+ System.arraycopy(inputColVector.isNull, 0, outputColVector.isNull, 0, n);
+ for(int i = 0; i != n; i++) {
+ if (!inputColVector.isNull[i]) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ public String vectorExpressionParameters() {
+ return getColumnParamString(0, inputColumn);
+ }
+
+ @Override
+ public VectorExpressionDescriptor.Descriptor getDescriptor() {
+ VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder();
+ b.setMode(VectorExpressionDescriptor.Mode.PROJECTION).setNumArguments(1)
+ .setArgumentTypes(getInputColumnType())
+ .setInputExpressionTypes(VectorExpressionDescriptor.InputExpressionType.COLUMN);
+ return b.build();
+ }
+
+ protected VectorExpressionDescriptor.ArgumentType getInputColumnType() {
+ return VectorExpressionDescriptor.ArgumentType.DATE;
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncTimestampToTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncTimestampToTimestamp.java
new file mode 100644
index 0000000..1a45ef9
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FuncTimestampToTimestamp.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.util.Arrays;
+
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+
+/**
+ * This is a superclass for unary functions and expressions taking a single timestamp and returning
+ * a timestamp, that operate directly on the input and set the output.
+ */
+public abstract class FuncTimestampToTimestamp extends VectorExpression {
+
+ private static final long serialVersionUID = 1L;
+ private final int inputColumn;
+
+ public FuncTimestampToTimestamp(int inputColumn, int outputColumnNum) {
+ super(outputColumnNum);
+ this.inputColumn = inputColumn;
+ }
+
+ public FuncTimestampToTimestamp() {
+ super();
+
+ // Dummy final assignments.
+ inputColumn = -1;
+ }
+
+ protected abstract void func(
+ TimestampColumnVector outputColVector, TimestampColumnVector inputColVector, int i);
+
+ @Override
+ public void evaluate(VectorizedRowBatch batch) throws HiveException {
+
+ if (childExpressions != null) {
+ super.evaluateChildren(batch);
+ }
+
+ TimestampColumnVector inputColVector = (TimestampColumnVector) batch.cols[inputColumn];
+ int[] sel = batch.selected;
+ int n = batch.size;
+ TimestampColumnVector outputColVector = (TimestampColumnVector) batch.cols[outputColumnNum];
+
+ boolean[] inputIsNull = inputColVector.isNull;
+ boolean[] outputIsNull = outputColVector.isNull;
+
+ if (n == 0) {
+
+ // Nothing to do
+ return;
+ }
+
+ // We do not need to do a column reset since we are carefully changing the output.
+ outputColVector.isRepeating = false;
+
+ if (inputColVector.isRepeating) {
+ if (inputColVector.noNulls || !inputIsNull[0]) {
+ // Set isNull before call in case it changes its mind.
+ outputIsNull[0] = false;
+ func(outputColVector, inputColVector, 0);
+ } else {
+ outputIsNull[0] = true;
+ outputColVector.noNulls = false;
+ }
+ outputColVector.isRepeating = true;
+ return;
+ }
+
+ if (inputColVector.noNulls) {
+ if (batch.selectedInUse) {
+
+ // CONSIDER: For large n, fill n or all of isNull array and use the tighter ELSE loop.
+
+ if (!outputColVector.noNulls) {
+ for(int j = 0; j != n; j++) {
+ final int i = sel[j];
+ // Set isNull before call in case it changes its mind.
+ outputIsNull[i] = false;
+ func(outputColVector, inputColVector, i);
+ }
+ } else {
+ for(int j = 0; j != n; j++) {
+ final int i = sel[j];
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else {
+ if (!outputColVector.noNulls) {
+
+ // Assume it is almost always a performance win to fill all of isNull so we can
+ // safely reset noNulls.
+ Arrays.fill(outputIsNull, false);
+ outputColVector.noNulls = true;
+ }
+ for(int i = 0; i != n; i++) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else /* there are nulls in the inputColVector */ {
+
+ // Carefully handle NULLs...
+ outputColVector.noNulls = false;
+
+ if (batch.selectedInUse) {
+ for(int j = 0; j != n; j++) {
+ int i = sel[j];
+ outputColVector.isNull[i] = inputColVector.isNull[i];
+ if (!inputColVector.isNull[i]) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ } else {
+ System.arraycopy(inputColVector.isNull, 0, outputColVector.isNull, 0, n);
+ for(int i = 0; i != n; i++) {
+ if (!inputColVector.isNull[i]) {
+ func(outputColVector, inputColVector, i);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ public String vectorExpressionParameters() {
+ return getColumnParamString(0, inputColumn);
+ }
+
+ @Override
+ public VectorExpressionDescriptor.Descriptor getDescriptor() {
+ VectorExpressionDescriptor.Builder b = new VectorExpressionDescriptor.Builder();
+ b.setMode(VectorExpressionDescriptor.Mode.PROJECTION).setNumArguments(1)
+ .setArgumentTypes(getInputColumnType())
+ .setInputExpressionTypes(VectorExpressionDescriptor.InputExpressionType.COLUMN);
+ return b.build();
+ }
+
+ protected VectorExpressionDescriptor.ArgumentType getInputColumnType() {
+ return VectorExpressionDescriptor.ArgumentType.TIMESTAMP;
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarDate.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarDate.java
new file mode 100644
index 0000000..5b5fff5
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarDate.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+
+import java.text.SimpleDateFormat;
+import java.util.TimeZone;
+
+/**
+ * Vectorized version of GenericUDFDatetimeLegacyHybridCalendar (datetime_legacy_hybrid_calendar).
+ * Converts a date/timestamp to legacy hybrid Julian-Gregorian calendar assuming that its internal
+ * days/milliseconds since epoch is calculated using the proleptic Gregorian calendar.
+ * Extends {@link FuncDateToDate}
+ */
+
+public class VectorUDFDatetimeLegacyHybridCalendarDate extends FuncDateToDate {
+ private static final long serialVersionUID = 1L;
+
+ // SimpleDateFormat doesn't serialize well; it's also not thread-safe
+ private static final ThreadLocal<SimpleDateFormat> SIMPLE_DATE_FORMAT_THREAD_LOCAL =
+ ThreadLocal.withInitial(() -> {
+ SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+ formatter.setLenient(false);
+ return formatter;
+ });
+
+ public VectorUDFDatetimeLegacyHybridCalendarDate() {
+ super();
+ }
+
+ public VectorUDFDatetimeLegacyHybridCalendarDate(int inputColumn, int outputColumnNum) {
+ super(inputColumn, outputColumnNum);
+ }
+
+ protected void func(LongColumnVector outputColVector, LongColumnVector inputColVector, int i) {
+ // get number of milliseconds from number of days
+ Date inputDate = Date.ofEpochDay((int) inputColVector.vector[i]);
+ java.sql.Date oldDate = new java.sql.Date(inputDate.toEpochMilli());
+ Date adjustedDate = Date.valueOf(SIMPLE_DATE_FORMAT_THREAD_LOCAL.get().format(oldDate));
+ outputColVector.vector[i] = adjustedDate.toEpochDay();
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarTimestamp.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarTimestamp.java
new file mode 100644
index 0000000..0bb93ff
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/VectorUDFDatetimeLegacyHybridCalendarTimestamp.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import java.text.SimpleDateFormat;
+import java.util.TimeZone;
+
+/**
+ * Vectorized version of GenericUDFDatetimeLegacyHybridCalendar (datetime_legacy_hybrid_calendar).
+ * Converts a date/timestamp to legacy hybrid Julian-Gregorian calendar assuming that its internal
+ * days/milliseconds since epoch is calculated using the proleptic Gregorian calendar.
+ * Extends {@link FuncTimestampToTimestamp}
+ */
+
+public class VectorUDFDatetimeLegacyHybridCalendarTimestamp extends FuncTimestampToTimestamp {
+ private static final long serialVersionUID = 1L;
+
+ // SimpleDateFormat doesn't serialize well; it's also not thread-safe
+ private static final ThreadLocal<SimpleDateFormat> SIMPLE_DATE_FORMAT_THREAD_LOCAL =
+ ThreadLocal.withInitial(() -> {
+ SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+ formatter.setLenient(false);
+ return formatter;
+ });
+
+ public VectorUDFDatetimeLegacyHybridCalendarTimestamp() {
+ super();
+ }
+
+ public VectorUDFDatetimeLegacyHybridCalendarTimestamp(int inputColumn, int outputColumnNum) {
+ super(inputColumn, outputColumnNum);
+ }
+
+ protected void func(TimestampColumnVector outputColVector, TimestampColumnVector inputColVector,
+ int i) {
+ String adjustedTimestampString = SIMPLE_DATE_FORMAT_THREAD_LOCAL.get()
+ .format(new java.sql.Timestamp(inputColVector.time[i]));
+ Timestamp adjustedTimestamp = Timestamp.valueOf(adjustedTimestampString);
+ outputColVector.time[i] = adjustedTimestamp.toEpochMilli();
+ // Nanos don't change
+ outputColVector.nanos[i] = inputColVector.nanos[i];
+ }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDatetimeLegacyHybridCalendar.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDatetimeLegacyHybridCalendar.java
index 4a94b44..b2f11d7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDatetimeLegacyHybridCalendar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDatetimeLegacyHybridCalendar.java
@@ -24,6 +24,9 @@ import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDatetimeLegacyHybridCalendarDate;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDatetimeLegacyHybridCalendarTimestamp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
@@ -36,16 +39,25 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
/**
- * GenericUDFToProlepticGregorian.
+ * GenericUDFDatetimeLegacyHybridCalendar.
*/
@Description(name = "datetime_legacy_hybrid_calendar",
- value = "_FUNC_(date/timestamp) - Converts a date/timestamp to new proleptic Gregorian calendar \n"
- + "assuming that its internal days/milliseconds since epoch is calculated using legacy Gregorian-Julian hybrid calendar.",
- extended = "Converts a date/timestamp to new proleptic Gregorian calendar (ISO 8601 standard), which is produced \n"
- + "by extending the Gregorian calendar backward to dates preceding its official introduction in 1582, assuming \n"
- + "that its internal days/milliseconds since epoch is calculated using legacy Gregorian-Julian hybrid calendar, \n"
- + "i.e., calendar that supports both the Julian and Gregorian calendar systems with the support of a single \n"
- + "discontinuity, which corresponds by default to the Gregorian date when the Gregorian calendar was instituted.")
+ value = "_FUNC_(date/timestamp) - Converts a date/timestamp to legacy hybrid Julian-Gregorian "
+ + "calendar\n"
+ + "assuming that its internal days/milliseconds since epoch is calculated using the "
+ + "proleptic Gregorian calendar.",
+ extended = "Converts a date/timestamp to legacy Gregorian-Julian hybrid calendar, i.e., "
+ + "calendar that supports both\n"
+ + "the Julian and Gregorian calendar systems with the support of a single discontinuity, "
+ + "which corresponds by\n"
+ + "default to the Gregorian date when the Gregorian calendar was instituted; assuming "
+ + "that its internal\n"
+ + "days/milliseconds since epoch is calculated using new proleptic Gregorian calendar "
+ + "(ISO 8601 standard), which\n"
+ + "is produced by extending the Gregorian calendar backward to dates preceding its "
+ + "official introduction in 1582.\n")
+@VectorizedExpressions({VectorUDFDatetimeLegacyHybridCalendarTimestamp.class,
+ VectorUDFDatetimeLegacyHybridCalendarDate.class })
public class GenericUDFDatetimeLegacyHybridCalendar extends GenericUDF {
private transient PrimitiveObjectInspector inputOI;
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorUDFDatetimeLegacyHybridCalendar.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorUDFDatetimeLegacyHybridCalendar.java
new file mode 100644
index 0000000..08fabdd
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorUDFDatetimeLegacyHybridCalendar.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TestVectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDFDatetimeLegacyHybridCalendar;
+import org.apache.hadoop.hive.serde2.io.DateWritableV2;
+import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests VectorUDFDatetimeLegacyHybridCalendarTimestamp and
+ * VectorUDFDatetimeLegacyHybridCalendarDate.
+ */
+public class TestVectorUDFDatetimeLegacyHybridCalendar {
+
+ @Test
+ public void testVectorUDFDatetimeLegacyHybridCalendarTimestamp() throws HiveException {
+ VectorizedRowBatch batch = getFreshBatchOfTimestamps(VectorizedRowBatch.DEFAULT_SIZE);
+ Assert.assertTrue(((TimestampColumnVector) batch.cols[1]).noNulls);
+ Assert.assertFalse(((TimestampColumnVector) batch.cols[1]).isRepeating);
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+ TestVectorizedRowBatch.addRandomNulls(batch.cols[0]);
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+
+ batch = getFreshBatchOfTimestamps(1);
+ batch.cols[0].isRepeating = true; //
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[0] = true;
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+
+ batch = getFreshBatchOfTimestamps(3);
+ batch.cols[0].isRepeating = false;
+ batch.selectedInUse = true;
+ batch.selected = new int[] {0, 1, 2};
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[0] = true;
+ verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(batch);
+ }
+
+ private VectorizedRowBatch getFreshBatchOfTimestamps(int size) {
+ return getVectorizedRowBatch(new java.sql.Timestamp[] {
+ new java.sql.Timestamp(Timestamp.valueOf("0001-01-01 00:00:00").toEpochMilli()),
+ new java.sql.Timestamp(Timestamp.valueOf("1400-01-01 00:30:00.123456").toEpochMilli()),
+ new java.sql.Timestamp(Timestamp.valueOf("1500-01-01 00:30:00").toEpochMilli()),
+ new java.sql.Timestamp(Timestamp.valueOf("1583-01-01 00:30:00.123").toEpochMilli()),
+ },
+ size);
+ }
+
+ /**
+ * Input array is used to fill the entire specified size of the vector row batch.
+ */
+ private VectorizedRowBatch getVectorizedRowBatch(java.sql.Timestamp[] inputs, int size) {
+ VectorizedRowBatch batch = new VectorizedRowBatch(2, size);
+ TimestampColumnVector inputCol = new TimestampColumnVector(size);
+ for (int i = 0; i < size; i++) {
+ inputCol.set(i, inputs[i % inputs.length]);
+ }
+ batch.cols[0] = inputCol;
+ batch.cols[1] = new TimestampColumnVector(size);
+ batch.size = size;
+ return batch;
+ }
+
+ private void verifyVectorUDFDatetimeLegacyHybridCalendarTimestamp(VectorizedRowBatch batch)
+ throws HiveException {
+ GenericUDF genUdf = new GenericUDFDatetimeLegacyHybridCalendar();
+ genUdf.initialize(new ObjectInspector[]{
+ PrimitiveObjectInspectorFactory.writableTimestampObjectInspector});
+
+ VectorExpression vecUdf = new VectorUDFDatetimeLegacyHybridCalendarTimestamp(0, 1);
+ vecUdf.evaluate(batch);
+ final int in = 0;
+ final int out = 1;
+
+ for (int i = 0; i < batch.size; i++) {
+ if (batch.cols[in].noNulls || !batch.cols[in].isNull[i]) {
+ java.sql.Timestamp input =
+ ((TimestampColumnVector) batch.cols[in]).asScratchTimestamp(i);
+ java.sql.Timestamp result =
+ ((TimestampColumnVector) batch.cols[out]).asScratchTimestamp(i);
+ compareToUDFDatetimeLegacyHybridCalendar(genUdf, input, result);
+ } else {
+ Assert.assertEquals(batch.cols[out].isNull[i], batch.cols[in].isNull[i]);
+ }
+ }
+ }
+
+ private void compareToUDFDatetimeLegacyHybridCalendar(
+ GenericUDF udf, java.sql.Timestamp in, java.sql.Timestamp out) throws HiveException {
+ TimestampWritableV2 tswInput = new TimestampWritableV2(
+ org.apache.hadoop.hive.common.type.Timestamp.ofEpochMilli(in.getTime(), in.getNanos()));
+ TimestampWritableV2 tswOutput = (TimestampWritableV2) udf
+ .evaluate(new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(tswInput)});
+ Assert.assertEquals(tswOutput.getTimestamp(), Timestamp.ofEpochMilli(out.getTime()));
+ Assert.assertEquals(tswOutput.getNanos(), out.getNanos());
+ }
+
+ @Test
+ public void testVectorUDFDatetimeLegacyHybridCalendarDate() throws HiveException {
+ VectorizedRowBatch batch = getFreshBatchOfDates(VectorizedRowBatch.DEFAULT_SIZE);
+ Assert.assertTrue(((LongColumnVector) batch.cols[1]).noNulls);
+ Assert.assertFalse(((LongColumnVector) batch.cols[1]).isRepeating);
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+ TestVectorizedRowBatch.addRandomNulls(batch.cols[0]);
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+
+ batch = getFreshBatchOfDates(1);
+ batch.cols[0].isRepeating = true; //
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[0] = true;
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+
+ batch = getFreshBatchOfDates(3);
+ batch.cols[0].isRepeating = false;
+ batch.selectedInUse = true;
+ batch.selected = new int[] {0, 1, 2};
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+ batch.cols[0].noNulls = false;
+ batch.cols[0].isNull[0] = true;
+ verifyVectorUDFDatetimeLegacyHybridCalendarDate(batch);
+ }
+
+ private VectorizedRowBatch getFreshBatchOfDates(int size) {
+ return getVectorizedRowBatch(new Long[] {
+ (long) Date.valueOf("0001-01-01").toEpochDay(),
+ (long) Date.valueOf("1400-01-01").toEpochDay(),
+ (long) Date.valueOf("1500-01-01").toEpochDay(),
+ (long) Date.valueOf("1583-01-01").toEpochDay(),
+ },
+ size);
+ }
+
+ /**
+ * Input array is used to fill the entire specified size of the vector row batch.
+ */
+ private VectorizedRowBatch getVectorizedRowBatch(Long[] inputs, int size) {
+ VectorizedRowBatch batch = new VectorizedRowBatch(2, size);
+ LongColumnVector inputCol = new LongColumnVector(size);
+ for (int i = 0; i < size; i++) {
+ inputCol.vector[i] = inputs[i % inputs.length];
+ }
+ batch.cols[0] = inputCol;
+ batch.cols[1] = new LongColumnVector(size);
+ batch.size = size;
+ return batch;
+ }
+
+
+ private void verifyVectorUDFDatetimeLegacyHybridCalendarDate(VectorizedRowBatch batch)
+ throws HiveException {
+ GenericUDF genUdf = new GenericUDFDatetimeLegacyHybridCalendar();
+ genUdf.initialize(
+ new ObjectInspector[] {PrimitiveObjectInspectorFactory.writableDateObjectInspector});
+
+ VectorExpression vecUdf = new VectorUDFDatetimeLegacyHybridCalendarDate(0, 1);
+ vecUdf.evaluate(batch);
+ final int in = 0;
+ final int out = 1;
+
+ for (int i = 0; i < batch.size; i++) {
+ if (batch.cols[in].noNulls || !batch.cols[in].isNull[i]) {
+ long input = ((LongColumnVector) batch.cols[in]).vector[i];
+ long output = ((LongColumnVector) batch.cols[out]).vector[i];
+ compareToUDFDatetimeLegacyHybridCalendar(genUdf, input, output);
+ } else {
+ Assert.assertEquals(batch.cols[out].isNull[i], batch.cols[in].isNull[i]);
+ }
+ }
+ }
+
+ private void compareToUDFDatetimeLegacyHybridCalendar(GenericUDF udf, long in, long out)
+ throws HiveException {
+ DateWritableV2 dateWInput = new DateWritableV2((int) in);
+ DateWritableV2 dateWOutput = (DateWritableV2) udf
+ .evaluate(new GenericUDF.DeferredObject[] {
+ new GenericUDF.DeferredJavaObject(dateWInput)});
+ Assert.assertEquals(dateWOutput.get(), Date.ofEpochDay((int) out));
+ }
+}
diff --git a/ql/src/test/queries/clientpositive/udf_datetime_legacy_hybrid_calendar.q b/ql/src/test/queries/clientpositive/udf_datetime_legacy_hybrid_calendar.q
index ce58a34..dab733d 100644
--- a/ql/src/test/queries/clientpositive/udf_datetime_legacy_hybrid_calendar.q
+++ b/ql/src/test/queries/clientpositive/udf_datetime_legacy_hybrid_calendar.q
@@ -10,3 +10,32 @@ SELECT
'0501-03-07 17:03:00.4321' AS tss,
CAST('0501-03-07 17:03:00.4321' AS TIMESTAMP) AS ts,
datetime_legacy_hybrid_calendar(CAST('0501-03-07 17:03:00.4321' AS TIMESTAMP)) AS tsp;
+
+--newer timestamps shouldn't be changed
+SELECT
+ '1600-03-07 17:03:00.4321' AS tss,
+ CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP) AS ts,
+ datetime_legacy_hybrid_calendar(CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP)) AS tsp;
+
+
+--test vectorized UDF--
+set hive.fetch.task.conversion=none;
+
+create table datetime_legacy_hybrid_calendar(dt date, ts timestamp) stored as orc;
+insert into datetime_legacy_hybrid_calendar values
+('0601-03-07', '0501-03-07 17:03:00.4321'),
+--post-1582 datetimes shouldn't be changed
+('1600-03-07', '1600-03-07 17:03:00.4321');
+
+EXPLAIN
+SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar;
+
+SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar;
+
+drop table datetime_legacy_hybrid_calendar;
diff --git a/ql/src/test/results/clientpositive/udf_datetime_legacy_hybrid_calendar.q.out b/ql/src/test/results/clientpositive/udf_datetime_legacy_hybrid_calendar.q.out
index 572c6c1..bd22442 100644
--- a/ql/src/test/results/clientpositive/udf_datetime_legacy_hybrid_calendar.q.out
+++ b/ql/src/test/results/clientpositive/udf_datetime_legacy_hybrid_calendar.q.out
@@ -2,19 +2,20 @@ PREHOOK: query: DESCRIBE FUNCTION datetime_legacy_hybrid_calendar
PREHOOK: type: DESCFUNCTION
POSTHOOK: query: DESCRIBE FUNCTION datetime_legacy_hybrid_calendar
POSTHOOK: type: DESCFUNCTION
-datetime_legacy_hybrid_calendar(date/timestamp) - Converts a date/timestamp to new proleptic Gregorian calendar
-assuming that its internal days/milliseconds since epoch is calculated using legacy Gregorian-Julian hybrid calendar.
+datetime_legacy_hybrid_calendar(date/timestamp) - Converts a date/timestamp to legacy hybrid Julian-Gregorian calendar
+assuming that its internal days/milliseconds since epoch is calculated using the proleptic Gregorian calendar.
PREHOOK: query: DESCRIBE FUNCTION EXTENDED datetime_legacy_hybrid_calendar
PREHOOK: type: DESCFUNCTION
POSTHOOK: query: DESCRIBE FUNCTION EXTENDED datetime_legacy_hybrid_calendar
POSTHOOK: type: DESCFUNCTION
-datetime_legacy_hybrid_calendar(date/timestamp) - Converts a date/timestamp to new proleptic Gregorian calendar
-assuming that its internal days/milliseconds since epoch is calculated using legacy Gregorian-Julian hybrid calendar.
-Converts a date/timestamp to new proleptic Gregorian calendar (ISO 8601 standard), which is produced
-by extending the Gregorian calendar backward to dates preceding its official introduction in 1582, assuming
-that its internal days/milliseconds since epoch is calculated using legacy Gregorian-Julian hybrid calendar,
-i.e., calendar that supports both the Julian and Gregorian calendar systems with the support of a single
-discontinuity, which corresponds by default to the Gregorian date when the Gregorian calendar was instituted.
+datetime_legacy_hybrid_calendar(date/timestamp) - Converts a date/timestamp to legacy hybrid Julian-Gregorian calendar
+assuming that its internal days/milliseconds since epoch is calculated using the proleptic Gregorian calendar.
+Converts a date/timestamp to legacy Gregorian-Julian hybrid calendar, i.e., calendar that supports both
+the Julian and Gregorian calendar systems with the support of a single discontinuity, which corresponds by
+default to the Gregorian date when the Gregorian calendar was instituted; assuming that its internal
+days/milliseconds since epoch is calculated using new proleptic Gregorian calendar (ISO 8601 standard), which
+is produced by extending the Gregorian calendar backward to dates preceding its official introduction in 1582.
+
Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDFDatetimeLegacyHybridCalendar
Function type:BUILTIN
PREHOOK: query: SELECT
@@ -47,3 +48,112 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
#### A masked pattern was here ####
0501-03-07 17:03:00.4321 0501-03-07 17:03:00.4321 0501-03-05 17:03:00.4321
+PREHOOK: query: SELECT
+ '1600-03-07 17:03:00.4321' AS tss,
+ CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP) AS ts,
+ datetime_legacy_hybrid_calendar(CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP)) AS tsp
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+ '1600-03-07 17:03:00.4321' AS tss,
+ CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP) AS ts,
+ datetime_legacy_hybrid_calendar(CAST('1600-03-07 17:03:00.4321' AS TIMESTAMP)) AS tsp
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+1600-03-07 17:03:00.4321 1600-03-07 17:03:00.4321 1600-03-07 17:03:00.4321
+PREHOOK: query: create table datetime_legacy_hybrid_calendar(dt date, ts timestamp) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@datetime_legacy_hybrid_calendar
+POSTHOOK: query: create table datetime_legacy_hybrid_calendar(dt date, ts timestamp) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@datetime_legacy_hybrid_calendar
+PREHOOK: query: insert into datetime_legacy_hybrid_calendar values
+('0601-03-07', '0501-03-07 17:03:00.4321'),
+
+('1600-03-07', '1600-03-07 17:03:00.4321')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@datetime_legacy_hybrid_calendar
+POSTHOOK: query: insert into datetime_legacy_hybrid_calendar values
+('0601-03-07', '0501-03-07 17:03:00.4321'),
+
+('1600-03-07', '1600-03-07 17:03:00.4321')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@datetime_legacy_hybrid_calendar
+POSTHOOK: Lineage: datetime_legacy_hybrid_calendar.dt SCRIPT []
+POSTHOOK: Lineage: datetime_legacy_hybrid_calendar.ts SCRIPT []
+PREHOOK: query: EXPLAIN
+SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar
+PREHOOK: type: QUERY
+PREHOOK: Input: default@datetime_legacy_hybrid_calendar
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@datetime_legacy_hybrid_calendar
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Map Reduce
+ Map Operator Tree:
+ TableScan
+ alias: datetime_legacy_hybrid_calendar
+ Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: dt (type: date), datetime_legacy_hybrid_calendar(dt) (type: date), ts (type: timestamp), datetime_legacy_hybrid_calendar(ts) (type: timestamp)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 2 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 2 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar
+PREHOOK: type: QUERY
+PREHOOK: Input: default@datetime_legacy_hybrid_calendar
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+ dt, datetime_legacy_hybrid_calendar(dt) AS dtp,
+ ts, datetime_legacy_hybrid_calendar(ts) AS tsp
+FROM datetime_legacy_hybrid_calendar
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@datetime_legacy_hybrid_calendar
+#### A masked pattern was here ####
+0601-03-07 0601-03-04 0501-03-07 17:03:00.4321 0501-03-05 17:03:00.4321
+1600-03-07 1600-03-07 1600-03-07 17:03:00.4321 1600-03-07 17:03:00.4321
+PREHOOK: query: drop table datetime_legacy_hybrid_calendar
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@datetime_legacy_hybrid_calendar
+PREHOOK: Output: default@datetime_legacy_hybrid_calendar
+POSTHOOK: query: drop table datetime_legacy_hybrid_calendar
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@datetime_legacy_hybrid_calendar
+POSTHOOK: Output: default@datetime_legacy_hybrid_calendar