You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2017/04/21 21:24:36 UTC
hive git commit: HIVE-15982 : Support the width_bucket function
(Sahil Takiar via Ashutosh Chauhan)
Repository: hive
Updated Branches:
refs/heads/master 13967d8f2 -> 6566065c0
HIVE-15982 : Support the width_bucket function (Sahil Takiar via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <ha...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6566065c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6566065c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6566065c
Branch: refs/heads/master
Commit: 6566065c0f36fa14057947252b0258919ffcbcee
Parents: 13967d8
Author: Sahil Takiar <ta...@gmail.com>
Authored: Fri Apr 21 14:24:00 2017 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Fri Apr 21 14:24:00 2017 -0700
----------------------------------------------------------------------
.../hadoop/hive/ql/exec/FunctionRegistry.java | 1 +
.../hadoop/hive/ql/udf/generic/GenericUDF.java | 3 +-
.../ql/udf/generic/GenericUDFWidthBucket.java | 89 +++++++++++++++
.../udf/generic/TestGenericUDFWidthBucket.java | 69 ++++++++++++
.../queries/clientpositive/udf_width_bucket.q | 29 +++++
.../results/clientpositive/show_functions.q.out | 1 +
.../clientpositive/udf_width_bucket.q.out | 111 +++++++++++++++++++
7 files changed, 302 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index ccfb455..8dc5f2e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -481,6 +481,7 @@ public final class FunctionRegistry {
system.registerGenericUDF("greatest", GenericUDFGreatest.class);
system.registerGenericUDF("least", GenericUDFLeast.class);
system.registerGenericUDF("cardinality_violation", GenericUDFCardinalityViolation.class);
+ system.registerGenericUDF("width_bucket", GenericUDFWidthBucket.class);
system.registerGenericUDF("from_utc_timestamp", GenericUDFFromUtcTimestamp.class);
system.registerGenericUDF("to_utc_timestamp", GenericUDFToUtcTimestamp.class);
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
index 00a4f38..68d98f5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
@@ -366,6 +366,7 @@ public abstract class GenericUDF implements Closeable {
case SHORT:
case INT:
case LONG:
+ case VOID:
break;
default:
throw new UDFArgumentTypeException(i, getFuncName()
@@ -375,7 +376,7 @@ public abstract class GenericUDF implements Closeable {
Converter converter = ObjectInspectorConverters.getConverter(
arguments[i],
- PrimitiveObjectInspectorFactory.writableIntObjectInspector);
+ PrimitiveObjectInspectorFactory.writableLongObjectInspector);
converters[i] = converter;
inputTypes[i] = inputType;
}
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
new file mode 100644
index 0000000..c767d35
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
@@ -0,0 +1,89 @@
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.NUMERIC_GROUP;
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.VOID_GROUP;
+
+
+/**
+ * Hive UDF {@code width_bucket(expr, min_value, max_value, num_buckets)}: assigns {@code expr} to one
+ * of {@code num_buckets} equal-width buckets spanning {@code [min_value, max_value]}.
+ *
+ * <p>Results produced by {@link #evaluate(DeferredObject[])}:
+ * <ul>
+ *   <li>{@code NULL} when any of the four arguments is {@code NULL};</li>
+ *   <li>{@code 0} when {@code expr < min_value};</li>
+ *   <li>{@code num_buckets + 1} when {@code expr >= max_value};</li>
+ *   <li>otherwise {@code floor(num_buckets * (expr - min_value) / (max_value - min_value)) + 1}.</li>
+ * </ul>
+ *
+ * <p>The first three arguments are read with {@code getLongValue} and the bucket count with
+ * {@code getIntValue}, so the computation is done on integral values.
+ *
+ * <p>NOTE(review): the {@code extended} help text below says "If expr < min_value, return 1" and shows
+ * {@code 1, 1, 3, 5} for its example, whereas this class returns 0 below the range and 0, 2, 4, 5 for
+ * those inputs. The strings are intentionally left untouched here because the golden file
+ * udf_width_bucket.q.out records them verbatim; the text and the golden file should be corrected together.
+ */
+@Description(name = "width_bucket",
+    value = "_FUNC_(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by "
+        + "mapping the expr into buckets defined by the range [min_value, max_value]",
+    extended = "Returns an integer between 0 and num_buckets+1 by "
+        + "mapping expr into the ith equally sized bucket. Buckets are made by dividing [min_value, max_value] into "
+        + "equally sized regions. If expr < min_value, return 1, if expr > max_value return num_buckets+1\n"
+        + "Example: expr is an integer column withs values 1, 10, 20, 30.\n"
+        + " > SELECT _FUNC_(expr, 5, 25, 4) FROM src;\n1\n1\n3\n5")
+public class GenericUDFWidthBucket extends GenericUDF {
+
+  /** Number of arguments the function accepts: expr, min_value, max_value, num_buckets. */
+  private static final int ARG_COUNT = 4;
+
+  // Per-argument primitive categories and converters, handed to the GenericUDF argument helpers
+  // in initialize() and used again by the value getters in evaluate().
+  private transient PrimitiveObjectInspector.PrimitiveCategory[] inputTypes =
+      new PrimitiveObjectInspector.PrimitiveCategory[ARG_COUNT];
+  private transient ObjectInspectorConverters.Converter[] converters =
+      new ObjectInspectorConverters.Converter[ARG_COUNT];
+
+  // Single writable instance reused as the return value of every evaluate() call.
+  private final IntWritable output = new IntWritable();
+
+  /**
+   * Validates the argument list and prepares the converters used by {@link #evaluate(DeferredObject[])}.
+   *
+   * @param arguments object inspectors of the four call arguments
+   * @return the writable int object inspector describing the function result
+   * @throws UDFArgumentException if one of the GenericUDF argument checks rejects the arguments
+   */
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    checkArgsSize(arguments, ARG_COUNT, ARG_COUNT);
+
+    // Same check order as before: all primitive checks first, then all type-group checks.
+    for (int i = 0; i < ARG_COUNT; i++) {
+      checkArgPrimitive(arguments, i);
+    }
+    for (int i = 0; i < ARG_COUNT; i++) {
+      checkArgGroups(arguments, i, inputTypes, NUMERIC_GROUP, VOID_GROUP);
+    }
+
+    // expr, min_value and max_value are read as long; num_buckets is read as int.
+    obtainLongConverter(arguments, 0, inputTypes, converters);
+    obtainLongConverter(arguments, 1, inputTypes, converters);
+    obtainLongConverter(arguments, 2, inputTypes, converters);
+    obtainIntConverter(arguments, 3, inputTypes, converters);
+
+    return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
+  }
+
+  /**
+   * Computes the bucket number for one row.
+   *
+   * @param arguments deferred values for expr, min_value, max_value and num_buckets, in that order
+   * @return the shared {@link IntWritable} holding the bucket number, or {@code null} if any argument is null
+   * @throws IllegalArgumentException if num_buckets is not positive, or if expr lies inside a range
+   *         whose min_value equals max_value (previously an {@code ArithmeticException: / by zero})
+   * @throws HiveException if an argument value cannot be obtained
+   */
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    Long exprValue = getLongValue(arguments, 0, converters);
+    Long minValue = getLongValue(arguments, 1, converters);
+    Long maxValue = getLongValue(arguments, 2, converters);
+    Integer numBuckets = getIntValue(arguments, 3, converters);
+
+    if (exprValue == null || minValue == null || maxValue == null || numBuckets == null) {
+      return null;
+    }
+
+    Preconditions.checkArgument(numBuckets > 0, "numBuckets in width_bucket function must be above 0");
+
+    if (exprValue < minValue) {
+      output.set(0);
+    } else if (exprValue > maxValue) {
+      output.set(numBuckets + 1);
+    } else {
+      output.set(bucketWithinRange(exprValue, minValue, maxValue, numBuckets));
+    }
+
+    return output;
+  }
+
+  /**
+   * Returns the 1-based bucket of a value already known to satisfy {@code min <= expr <= max}.
+   *
+   * <p>The bucket is derived from the exact proportion {@code numBuckets * (expr - min) / (max - min)}
+   * instead of a truncated integer bucket width. A truncated width is zero whenever the range is
+   * narrower than the bucket count (division by zero) and misplaces values whenever the range is not
+   * a multiple of the bucket count.
+   */
+  private static int bucketWithinRange(long expr, long min, long max, int numBuckets) {
+    // min <= expr <= max holds here, so min == max means the range is empty and has no bucket width.
+    Preconditions.checkArgument(min != max, "minValue and maxValue in width_bucket function must not be equal");
+
+    long range = max - min;
+    long offset = expr - min;
+    // range > 0 proves max - min did not wrap around; then 0 <= offset <= range as well.
+    if (range > 0 && offset <= Long.MAX_VALUE / numBuckets) {
+      // Both operands are non-negative, so integer truncation is the floor. expr == max yields numBuckets + 1.
+      return (int) (numBuckets * offset / range) + 1;
+    }
+
+    // The subtraction or the product would overflow a long: fall back to floating point, which may
+    // lose precision for such extreme values but cannot wrap around. Clamp to the overflow bucket.
+    double position = (double) numBuckets * ((double) expr - (double) min) / ((double) max - (double) min);
+    return (int) Math.min((long) position, (long) numBuckets) + 1;
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    return getStandardDisplayString("width_bucket", children);
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
new file mode 100644
index 0000000..4cefcf8
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
@@ -0,0 +1,69 @@
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.IntWritable;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+
+/**
+ * Unit tests for {@link GenericUDFWidthBucket}. Each case initializes a fresh UDF with three Java long
+ * arguments and one Java int argument and checks the bucket number it returns.
+ */
+public class TestGenericUDFWidthBucket {
+
+  @Test
+  public void testExprLessThanMinValue() throws HiveException {
+    IntWritable bucket = testWidthBucketWithValues(99L, 100L, 5000L, 10);
+    assertEquals(0, bucket.get());
+  }
+
+  @Test
+  public void testExprEqualsMinValue() throws HiveException {
+    IntWritable bucket = testWidthBucketWithValues(100L, 100L, 5000L, 10);
+    assertEquals(1, bucket.get());
+  }
+
+  @Test
+  public void testExprEqualsBoundaryValue() throws HiveException {
+    // 590 = 100 + (5000 - 100) / 10, i.e. the boundary between the first and second bucket.
+    IntWritable bucket = testWidthBucketWithValues(590L, 100L, 5000L, 10);
+    assertEquals(2, bucket.get());
+  }
+
+  @Test
+  public void testExprEqualsMaxValue() throws HiveException {
+    IntWritable bucket = testWidthBucketWithValues(5000L, 100L, 5000L, 10);
+    assertEquals(11, bucket.get());
+  }
+
+  @Test
+  public void testExprAboveMaxValue() throws HiveException {
+    IntWritable bucket = testWidthBucketWithValues(6000L, 100L, 5000L, 10);
+    assertEquals(11, bucket.get());
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testNegativeBucketValue() throws HiveException {
+    testWidthBucketWithValues(100L, 100L, 5000L, -1);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testZeroBucketValue() throws HiveException {
+    testWidthBucketWithValues(100L, 100L, 5000L, 0);
+  }
+
+  /**
+   * Runs width_bucket once on a newly created UDF instance.
+   *
+   * @return the value returned by {@code evaluate}, cast to {@link IntWritable}
+   */
+  private IntWritable testWidthBucketWithValues(Long expr, Long minValue, Long maxValue, Integer numBuckets)
+      throws HiveException {
+    ObjectInspector longInspector = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+    ObjectInspector intInspector = PrimitiveObjectInspectorFactory.javaIntObjectInspector;
+
+    GenericUDFWidthBucket widthBucket = new GenericUDFWidthBucket();
+    widthBucket.initialize(new ObjectInspector[] {longInspector, longInspector, longInspector, intInspector});
+
+    GenericUDF.DeferredObject[] deferredArgs = {
+        new GenericUDF.DeferredJavaObject(expr),
+        new GenericUDF.DeferredJavaObject(minValue),
+        new GenericUDF.DeferredJavaObject(maxValue),
+        new GenericUDF.DeferredJavaObject(numBuckets)
+    };
+
+    Object result = widthBucket.evaluate(deferredArgs);
+    return (IntWritable) result;
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/queries/clientpositive/udf_width_bucket.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_width_bucket.q b/ql/src/test/queries/clientpositive/udf_width_bucket.q
new file mode 100644
index 0000000..6ac60d6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_width_bucket.q
@@ -0,0 +1,29 @@
+describe function width_bucket;
+desc function extended width_bucket;
+
+explain select width_bucket(10, 5, 25, 4);
+
+select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4);
+
+select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL);
+
+select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4);
+
+select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4);
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/results/clientpositive/show_functions.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out
index 68e248a..ac5ca41 100644
--- a/ql/src/test/results/clientpositive/show_functions.q.out
+++ b/ql/src/test/results/clientpositive/show_functions.q.out
@@ -261,6 +261,7 @@ variance
version
weekofyear
when
+width_bucket
windowingtablefunction
xpath
xpath_boolean
http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/results/clientpositive/udf_width_bucket.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_width_bucket.q.out b/ql/src/test/results/clientpositive/udf_width_bucket.q.out
new file mode 100644
index 0000000..a72e977
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_width_bucket.q.out
@@ -0,0 +1,111 @@
+PREHOOK: query: describe function width_bucket
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: describe function width_bucket
+POSTHOOK: type: DESCFUNCTION
+width_bucket(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by mapping the expr into buckets defined by the range [min_value, max_value]
+PREHOOK: query: desc function extended width_bucket
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: desc function extended width_bucket
+POSTHOOK: type: DESCFUNCTION
+width_bucket(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by mapping the expr into buckets defined by the range [min_value, max_value]
+Returns an integer between 0 and num_buckets+1 by mapping expr into the ith equally sized bucket. Buckets are made by dividing [min_value, max_value] into equally sized regions. If expr < min_value, return 1, if expr > max_value return num_buckets+1
+Example: expr is an integer column withs values 1, 10, 20, 30.
+ > SELECT width_bucket(expr, 5, 25, 4) FROM src;
+1
+1
+3
+5
+Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDFWidthBucket
+Function type:BUILTIN
+PREHOOK: query: explain select width_bucket(10, 5, 25, 4)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select width_bucket(10, 5, 25, 4)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ TableScan
+ alias: _dummy_table
+ Row Limit Per Split: 1
+ Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: 2 (type: int)
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+ ListSink
+
+PREHOOK: query: select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+0 2 4 5
+PREHOOK: query: select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+NULL NULL NULL NULL NULL
+PREHOOK: query: select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+5 4 2 0
+PREHOOK: query: select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+0 2 4 5