Posted to commits@hive.apache.org by ha...@apache.org on 2017/04/21 21:24:36 UTC

hive git commit: HIVE-15982 : Support the width_bucket function (Sahil Takiar via Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master 13967d8f2 -> 6566065c0


HIVE-15982 : Support the width_bucket function (Sahil Takiar via Ashutosh Chauhan)

Signed-off-by: Ashutosh Chauhan <ha...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/6566065c
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/6566065c
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/6566065c

Branch: refs/heads/master
Commit: 6566065c0f36fa14057947252b0258919ffcbcee
Parents: 13967d8
Author: Sahil Takiar <ta...@gmail.com>
Authored: Fri Apr 21 14:24:00 2017 -0700
Committer: Ashutosh Chauhan <ha...@apache.org>
Committed: Fri Apr 21 14:24:00 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/FunctionRegistry.java   |   1 +
 .../hadoop/hive/ql/udf/generic/GenericUDF.java  |   3 +-
 .../ql/udf/generic/GenericUDFWidthBucket.java   |  89 +++++++++++++++
 .../udf/generic/TestGenericUDFWidthBucket.java  |  69 ++++++++++++
 .../queries/clientpositive/udf_width_bucket.q   |  29 +++++
 .../results/clientpositive/show_functions.q.out |   1 +
 .../clientpositive/udf_width_bucket.q.out       | 111 +++++++++++++++++++
 7 files changed, 302 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index ccfb455..8dc5f2e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -481,6 +481,7 @@ public final class FunctionRegistry {
     system.registerGenericUDF("greatest", GenericUDFGreatest.class);
     system.registerGenericUDF("least", GenericUDFLeast.class);
     system.registerGenericUDF("cardinality_violation", GenericUDFCardinalityViolation.class);
+    system.registerGenericUDF("width_bucket", GenericUDFWidthBucket.class);
 
     system.registerGenericUDF("from_utc_timestamp", GenericUDFFromUtcTimestamp.class);
     system.registerGenericUDF("to_utc_timestamp", GenericUDFToUtcTimestamp.class);

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
index 00a4f38..68d98f5 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
@@ -366,6 +366,7 @@ public abstract class GenericUDF implements Closeable {
     case SHORT:
     case INT:
     case LONG:
+    case VOID:
       break;
     default:
       throw new UDFArgumentTypeException(i, getFuncName()
@@ -375,7 +376,7 @@ public abstract class GenericUDF implements Closeable {
 
     Converter converter = ObjectInspectorConverters.getConverter(
         arguments[i],
-        PrimitiveObjectInspectorFactory.writableIntObjectInspector);
+        PrimitiveObjectInspectorFactory.writableLongObjectInspector);
     converters[i] = converter;
     inputTypes[i] = inputType;
   }
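
Two things change in the hunk above: the helper now accepts VOID arguments (so a literal
NULL can be passed to the new function), and the converter target becomes
writableLongObjectInspector, so values obtained through it really are longs rather than
ints. From the accepted types this appears to be the long-converter helper
(obtainLongConverter) that GenericUDFWidthBucket.initialize() relies on. A minimal,
self-contained sketch of what a converter obtained this way does (plain serde2 API,
nothing specific to this patch):

    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.io.LongWritable;

    public class ConverterSketch {
      public static void main(String[] args) {
        // Convert a plain Java Long argument into the LongWritable form the UDF helpers hand back.
        ObjectInspectorConverters.Converter converter = ObjectInspectorConverters.getConverter(
            PrimitiveObjectInspectorFactory.javaLongObjectInspector,
            PrimitiveObjectInspectorFactory.writableLongObjectInspector);
        LongWritable converted = (LongWritable) converter.convert(42L);
        System.out.println(converted.get());   // prints 42
      }
    }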

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
new file mode 100644
index 0000000..c767d35
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWidthBucket.java
@@ -0,0 +1,89 @@
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.NUMERIC_GROUP;
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.VOID_GROUP;
+
+
+@Description(name = "width_bucket",
+    value = "_FUNC_(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by "
+        + "mapping the expr into buckets defined by the range [min_value, max_value]",
+    extended = "Returns an integer between 0 and num_buckets+1 by "
+        + "mapping expr into the ith equally sized bucket. Buckets are made by dividing [min_value, max_value] into "
+        + "equally sized regions. If expr < min_value, return 1, if expr > max_value return num_buckets+1\n"
+        + "Example: expr is an integer column withs values 1, 10, 20, 30.\n"
+        + "  > SELECT _FUNC_(expr, 5, 25, 4) FROM src;\n1\n1\n3\n5")
+public class GenericUDFWidthBucket extends GenericUDF {
+
+  private transient PrimitiveObjectInspector.PrimitiveCategory[] inputTypes = new PrimitiveObjectInspector.PrimitiveCategory[4];
+  private transient ObjectInspectorConverters.Converter[] converters = new ObjectInspectorConverters.Converter[4];
+
+  private final IntWritable output = new IntWritable();
+
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    checkArgsSize(arguments, 4, 4);
+
+    checkArgPrimitive(arguments, 0);
+    checkArgPrimitive(arguments, 1);
+    checkArgPrimitive(arguments, 2);
+    checkArgPrimitive(arguments, 3);
+
+    checkArgGroups(arguments, 0, inputTypes, NUMERIC_GROUP, VOID_GROUP);
+    checkArgGroups(arguments, 1, inputTypes, NUMERIC_GROUP, VOID_GROUP);
+    checkArgGroups(arguments, 2, inputTypes, NUMERIC_GROUP, VOID_GROUP);
+    checkArgGroups(arguments, 3, inputTypes, NUMERIC_GROUP, VOID_GROUP);
+
+    obtainLongConverter(arguments, 0, inputTypes, converters);
+    obtainLongConverter(arguments, 1, inputTypes, converters);
+    obtainLongConverter(arguments, 2, inputTypes, converters);
+    obtainIntConverter(arguments, 3, inputTypes, converters);
+
+    return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
+  }
+
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    Long exprValue = getLongValue(arguments, 0, converters);
+    Long minValue = getLongValue(arguments, 1, converters);
+    Long maxValue = getLongValue(arguments, 2, converters);
+    Integer numBuckets = getIntValue(arguments, 3, converters);
+
+    if (exprValue == null || minValue == null || maxValue == null || numBuckets == null) {
+      return null;
+    }
+
+    Preconditions.checkArgument(numBuckets > 0, "numBuckets in width_bucket function must be above 0");
+    long intervalSize = (maxValue - minValue) / numBuckets;
+
+    if (exprValue < minValue) {
+      output.set(0);
+    } else if (exprValue > maxValue) {
+      output.set(numBuckets + 1);
+    } else {
+      long diff = exprValue - minValue;
+      if (diff % intervalSize == 0) {
+        output.set((int) (diff/intervalSize + 1));
+      } else {
+        output.set((int) Math.ceil((double) (diff) / intervalSize));
+      }
+    }
+
+    return output;
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    return getStandardDisplayString("width_bucket", children);
+  }
+}
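
Stripped of the UDF plumbing, the bucketing arithmetic in evaluate() reduces to the sketch
below. Note that it works with an integer interval size, so it implicitly assumes
(max_value - min_value) is at least num_buckets and, for exact bucket boundaries, divides
evenly; both hold for every case in the tests committed here. Class and method names in
the sketch are illustrative, not part of the patch:

    public class WidthBucketSketch {

      // Mirrors the logic in GenericUDFWidthBucket.evaluate(); assumes max > min and
      // (max - min) >= numBuckets so intervalSize is never zero.
      static int widthBucket(long expr, long min, long max, int numBuckets) {
        long intervalSize = (max - min) / numBuckets;
        if (expr < min) {
          return 0;                      // below the range
        } else if (expr > max) {
          return numBuckets + 1;         // above the range
        }
        long diff = expr - min;
        if (diff % intervalSize == 0) {
          return (int) (diff / intervalSize + 1);
        }
        return (int) Math.ceil((double) diff / intervalSize);
      }

      public static void main(String[] args) {
        // Same values as the first query in udf_width_bucket.q; prints 0, 2, 4, 5.
        for (long expr : new long[] {1, 10, 20, 30}) {
          System.out.println(widthBucket(expr, 5, 25, 4));
        }
      }
    }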

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
new file mode 100644
index 0000000..4cefcf8
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFWidthBucket.java
@@ -0,0 +1,69 @@
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.IntWritable;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+
+public class TestGenericUDFWidthBucket {
+
+  @Test
+  public void testExprLessThanMinValue() throws HiveException {
+    assertEquals(0, testWidthBucketWithValues(99L, 100L, 5000L, 10).get());
+  }
+
+  @Test
+  public void testExprEqualsMinValue() throws HiveException {
+    assertEquals(1, testWidthBucketWithValues(100L, 100L, 5000L, 10).get());
+  }
+
+  @Test
+  public void testExprEqualsBoundaryValue() throws HiveException {
+    assertEquals(2, testWidthBucketWithValues(590L, 100L, 5000L, 10).get());
+  }
+
+  @Test
+  public void testExprEqualsMaxValue() throws HiveException {
+    assertEquals(11, testWidthBucketWithValues(5000L, 100L, 5000L, 10).get());
+  }
+
+  @Test
+  public void testExprAboveMaxValue() throws HiveException {
+    assertEquals(11, testWidthBucketWithValues(6000L, 100L, 5000L, 10).get());
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testNegativeBucketValue() throws HiveException {
+    testWidthBucketWithValues(100L, 100L, 5000L, -1);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void testZeroBucketValue() throws HiveException {
+    testWidthBucketWithValues(100L, 100L, 5000L, 0);
+  }
+
+  private IntWritable testWidthBucketWithValues(Long expr, Long minValue, Long maxValue, Integer numBuckets) throws HiveException {
+    GenericUDFWidthBucket udf = new GenericUDFWidthBucket();
+    ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+    ObjectInspector valueOI2 = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+    ObjectInspector valueOI3 = PrimitiveObjectInspectorFactory.javaLongObjectInspector;
+    ObjectInspector valueOI4 = PrimitiveObjectInspectorFactory.javaIntObjectInspector;
+    ObjectInspector[] arguments = {valueOI1, valueOI2, valueOI3, valueOI4};
+
+    udf.initialize(arguments);
+
+    GenericUDF.DeferredObject valueObj1 = new GenericUDF.DeferredJavaObject(expr);
+    GenericUDF.DeferredObject valueObj2 = new GenericUDF.DeferredJavaObject(minValue);
+    GenericUDF.DeferredObject valueObj3 = new GenericUDF.DeferredJavaObject(maxValue);
+    GenericUDF.DeferredObject valueObj4 = new GenericUDF.DeferredJavaObject(numBuckets);
+    GenericUDF.DeferredObject[] args = {valueObj1, valueObj2, valueObj3, valueObj4};
+
+    return (IntWritable) udf.evaluate(args);
+  }
+}
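
The JUnit cases above exercise the numeric boundaries; NULL handling is covered only by
the .q file further down, even though assertNull is already imported. If a unit-level
check were wanted as well, a hypothetical extra case (not part of this patch) could reuse
the same helper, since getLongValue() returns null for a null DeferredObject and
evaluate() then short-circuits to null:

      @Test
      public void testNullArgumentReturnsNull() throws HiveException {
        // Any NULL argument should make the UDF return NULL, matching udf_width_bucket.q.out below.
        assertNull(testWidthBucketWithValues(null, 100L, 5000L, 10));
      }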

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/queries/clientpositive/udf_width_bucket.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/udf_width_bucket.q b/ql/src/test/queries/clientpositive/udf_width_bucket.q
new file mode 100644
index 0000000..6ac60d6
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/udf_width_bucket.q
@@ -0,0 +1,29 @@
+describe function width_bucket;
+desc function extended width_bucket;
+
+explain select width_bucket(10, 5, 25, 4);
+
+select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4);
+
+select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL);
+
+select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4);
+
+select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4);

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/results/clientpositive/show_functions.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out
index 68e248a..ac5ca41 100644
--- a/ql/src/test/results/clientpositive/show_functions.q.out
+++ b/ql/src/test/results/clientpositive/show_functions.q.out
@@ -261,6 +261,7 @@ variance
 version
 weekofyear
 when
+width_bucket
 windowingtablefunction
 xpath
 xpath_boolean

http://git-wip-us.apache.org/repos/asf/hive/blob/6566065c/ql/src/test/results/clientpositive/udf_width_bucket.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/udf_width_bucket.q.out b/ql/src/test/results/clientpositive/udf_width_bucket.q.out
new file mode 100644
index 0000000..a72e977
--- /dev/null
+++ b/ql/src/test/results/clientpositive/udf_width_bucket.q.out
@@ -0,0 +1,111 @@
+PREHOOK: query: describe function width_bucket
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: describe function width_bucket
+POSTHOOK: type: DESCFUNCTION
+width_bucket(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by mapping the expr into buckets defined by the range [min_value, max_value]
+PREHOOK: query: desc function extended width_bucket
+PREHOOK: type: DESCFUNCTION
+POSTHOOK: query: desc function extended width_bucket
+POSTHOOK: type: DESCFUNCTION
+width_bucket(expr, min_value, max_value, num_buckets) - Returns an integer between 0 and num_buckets+1 by mapping the expr into buckets defined by the range [min_value, max_value]
+Returns an integer between 0 and num_buckets+1 by mapping expr into the ith equally sized bucket. Buckets are made by dividing [min_value, max_value] into equally sized regions. If expr < min_value, return 0; if expr > max_value, return num_buckets+1
+Example: expr is an integer column with values 1, 10, 20, 30.
+  > SELECT width_bucket(expr, 5, 25, 4) FROM src;
+0
+2
+4
+5
+Function class:org.apache.hadoop.hive.ql.udf.generic.GenericUDFWidthBucket
+Function type:BUILTIN
+PREHOOK: query: explain select width_bucket(10, 5, 25, 4)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select width_bucket(10, 5, 25, 4)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        TableScan
+          alias: _dummy_table
+          Row Limit Per Split: 1
+          Statistics: Num rows: 1 Data size: 1 Basic stats: COMPLETE Column stats: COMPLETE
+          Select Operator
+            expressions: 2 (type: int)
+            outputColumnNames: _col0
+            Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+            ListSink
+
+PREHOOK: query: select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(1, 5, 25, 4),
+width_bucket(10, 5, 25, 4),
+width_bucket(20, 5, 25, 4),
+width_bucket(30, 5, 25, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+0	2	4	5
+PREHOOK: query: select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(1, NULL, 25, 4),
+width_bucket(NULL, 5, 25, 4),
+width_bucket(20, 5, NULL, 4),
+width_bucket(30, 5, 25, NULL),
+width_bucket(NULL, NULL, NULL, NULL)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+NULL	NULL	NULL	NULL	NULL
+PREHOOK: query: select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(-1, -25, -5, 4),
+width_bucket(-10, -25, -5, 4),
+width_bucket(-20, -25, -5, 4),
+width_bucket(-30, -25, -5, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+5	4	2	0
+PREHOOK: query: select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select
+width_bucket(-10, -5, 15, 4),
+width_bucket(0, -5, 15, 4),
+width_bucket(10, -5, 15, 4),
+width_bucket(20, -5, 15, 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+0	2	4	5