You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hivemall.apache.org by GitBox <gi...@apache.org> on 2019/01/08 11:02:16 UTC

[GitHub] asfgit closed pull request #176: [HIVEMALL-231] Replaced subarray UDF implementation with SubarrayUDF

asfgit closed pull request #176: [HIVEMALL-231] Replaced subarray UDF implementation with SubarrayUDF
URL: https://github.com/apache/incubator-hivemall/pull/176
 
 
   

This pull request was merged from a forked repository.
Because GitHub hides the original diff of such a pull request once it
is merged, the diff is reproduced below for the sake of provenance:

diff --git a/bin/update_func_md.sh b/bin/update_func_md.sh
index bb0afda3a..bb590e553 100755
--- a/bin/update_func_md.sh
+++ b/bin/update_func_md.sh
@@ -34,14 +34,14 @@ HIVEMALL_HOME=`pwd`
 
 # Deploy to local Maven repos
 
-export MAVEN_OPTS=-XX:MaxPermSize=256m
+export MAVEN_OPTS="-XX:MaxPermSize=256m -Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2"
 mvn clean install -DskipTests=true -Dmaven.test.skip=true -pl '.,core,nlp,xgboost,tools/hivemall-docs'
 
 # Generate docs
 
 mvn org.apache.hivemall:hivemall-docs:generate-funcs-list -pl '.,core,nlp,xgboost,tools/hivemall-docs' -X
 
-# Run HTTP server on localhost:040
+# Run HTTP server on localhost:4000
 
 cd ${HIVEMALL_HOME}/docs/gitbook
 gitbook install && gitbook serve
diff --git a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
index e842df6ef..e57caadfd 100644
--- a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
+++ b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
@@ -43,7 +43,7 @@
 @Description(name = "array_slice",
         value = "_FUNC_(array<ANY> values, int offset [, int length]) - Slices the given array by the given offset and length parameters.",
         extended = "SELECT \n" + 
-                "  array_slice(array(1,2,3,4,5,6), 2,4),\n" + 
+                "  array_slice(array(1,2,3,4,5,6),2,4),\n" + 
                 "  array_slice(\n" + 
                 "   array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + 
                 "   0, -- offset\n" + 
diff --git a/core/src/main/java/hivemall/tools/array/SubarrayUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayUDF.java
new file mode 100644
index 000000000..0a25eb665
--- /dev/null
+++ b/core/src/main/java/hivemall/tools/array/SubarrayUDF.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.array;
+
+import static hivemall.utils.lang.StringUtils.join;
+
+import hivemall.utils.hadoop.HiveUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
+
+/**
+ * Hive generic UDF registered as {@code subarray(values, fromIndex [, toIndex])}.
+ *
+ * <p>Returns the elements of {@code values} from {@code fromIndex} (inclusive) up to
+ * {@code toIndex} (exclusive). A negative {@code fromIndex} is clamped to 0, a
+ * {@code toIndex} larger than the array length is clamped to that length, and an
+ * omitted {@code toIndex} means "up to the end of the array".
+ */
+// @formatter:off
+@Description(name = "subarray",
+        value = "_FUNC_(array<ANY> values, int fromIndex [, int toIndex])" +
+                " - Returns a slice of the original array between the inclusive fromIndex and the exclusive toIndex.",
+        extended = "SELECT \n" +
+                "  subarray(array(0,1,2,3,4,5),4),\n" +
+                "  subarray(array(0,1,2,3,4,5),3,4),\n" +
+                "  subarray(array(0,1,2,3,4,5),3,3),\n" +
+                "  subarray(array(0,1,2,3,4,5),3,2),\n" +
+                "  subarray(array(0,1,2,3,4,5),0,2),\n" +
+                "  subarray(array(0,1,2,3,4,5),-1,2),\n" +
+                "  subarray(array(1,2,3,4,5,6),4),\n" +
+                "  subarray(array(1,2,3,4,5,6),4,6),\n" +
+                "  subarray(array(1,2,3,4,5,6),2,4),\n" +
+                "  subarray(array(1,2,3,4,5,6),0,2),\n" +
+                "  subarray(array(1,2,3,4,5,6),4,6),\n" +
+                "  subarray(array(1,2,3,4,5,6),4,7);\n" +
+                "\n" +
+                " [4,5]\n" +
+                " [3]\n" +
+                " []\n" +
+                " []\n" +
+                " [0,1]\n" +
+                " [0,1]\n" +
+                " [5,6]\n" +
+                " [5,6]\n" +
+                " [3,4]\n" +
+                " [1,2]\n" +
+                " [5,6]\n" +
+                " [5,6]")
+// @formatter:on
+@UDFType(deterministic = true, stateful = false)
+public final class SubarrayUDF extends GenericUDF {
+
+    /** Inspector for the 1st argument (the input array); assigned in initialize(). */
+    private ListObjectInspector valuesOI;
+    /** Inspector for the 2nd argument (fromIndex); assigned in initialize(). */
+    private PrimitiveObjectInspector fromIndexOI;
+    /** Inspector for the optional 3rd argument (toIndex); assigned only when it is supplied. */
+    @Nullable
+    private PrimitiveObjectInspector toIndexOI;
+
+    /** Output buffer, cleared and refilled by every evaluate() call. */
+    private final List<Object> result = new ArrayList<>();
+
+    /**
+     * Validates the argument count and captures the object inspectors of the arguments.
+     *
+     * @param argOIs inspectors for (values, fromIndex) or (values, fromIndex, toIndex)
+     * @return a standard list inspector built on the element inspector of the input array
+     * @throws UDFArgumentException if the number of arguments is neither 2 nor 3; the
+     *         HiveUtils type checks may also reject an argument
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
+        if (argOIs.length != 2 && argOIs.length != 3) {
+            throw new UDFArgumentLengthException(
+                "Expected 2 or 3 arguments, but got " + argOIs.length);
+        }
+
+        this.valuesOI = HiveUtils.asListOI(argOIs[0]);
+        this.fromIndexOI = HiveUtils.asIntegerOI(argOIs[1]);
+        if (argOIs.length == 3) {
+            this.toIndexOI = HiveUtils.asIntegerOI(argOIs[2]);
+        }
+
+        ObjectInspector elemOI = valuesOI.getListElementObjectInspector();
+        return ObjectInspectorFactory.getStandardListObjectInspector(elemOI);
+    }
+
+    /**
+     * Computes the slice for one row.
+     *
+     * @param args the array, fromIndex and, optionally, toIndex
+     * @return the reused result buffer holding the slice, or null when the array is null
+     * @throws HiveException (as UDFArgumentException) if fromIndex or toIndex is null
+     */
+    @Nullable
+    @Override
+    public List<Object> evaluate(@Nonnull DeferredObject[] args) throws HiveException {
+        final Object values = args[0].get();
+        if (values == null) {
+            return null;
+        }
+        result.clear();
+
+        final int size = valuesOI.getListLength(values);
+
+        final Object fromArg = args[1].get();
+        if (fromArg == null) {
+            throw new UDFArgumentException("2nd argument MUST NOT be null");
+        }
+        // A negative fromIndex is treated as the start of the array.
+        final int fromIndex =
+                Math.max(0, PrimitiveObjectInspectorUtils.getInt(fromArg, fromIndexOI));
+
+        int toIndex = size;
+        if (args.length == 3) {
+            final Object toArg = args[2].get();
+            if (toArg == null) {
+                throw new UDFArgumentException("3rd argument MUST NOT be null");
+            }
+            // A toIndex past the end of the array is treated as the array length.
+            toIndex = Math.min(size, PrimitiveObjectInspectorUtils.getInt(toArg, toIndexOI));
+        }
+
+        // The loop body never runs when fromIndex >= toIndex, which yields an empty list.
+        for (int i = fromIndex; i < toIndex; i++) {
+            result.add(valuesOI.getListElement(values, i));
+        }
+
+        return result;
+    }
+
+    /** Formats the call as {@code subarray(<args joined by ','>)}. */
+    @Override
+    public String getDisplayString(String[] args) {
+        return "subarray(" + join(args, ',') + ")";
+    }
+
+}
diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive
index 69dcf6934..5e2be023c 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -416,9 +416,8 @@ CREATE FUNCTION array_concat as 'hivemall.tools.array.ArrayConcatUDF' USING JAR
 DROP FUNCTION IF EXISTS concat_array;
 CREATE FUNCTION concat_array as 'hivemall.tools.array.ArrayConcatUDF' USING JAR '${hivemall_jar}';
 
--- alias for backward compatibility
 DROP FUNCTION IF EXISTS subarray;
-CREATE FUNCTION subarray as 'hivemall.tools.array.ArraySliceUDF' USING JAR '${hivemall_jar}';
+CREATE FUNCTION subarray as 'hivemall.tools.array.SubarrayUDF' USING JAR '${hivemall_jar}';
 
 DROP FUNCTION IF EXISTS array_slice;
 CREATE FUNCTION array_slice as 'hivemall.tools.array.ArraySliceUDF' USING JAR '${hivemall_jar}';
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index f39aea3be..6c6d92929 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -408,9 +408,8 @@ create temporary function array_concat as 'hivemall.tools.array.ArrayConcatUDF';
 drop temporary function if exists concat_array;
 create temporary function concat_array as 'hivemall.tools.array.ArrayConcatUDF';
 
--- alias for backward compatibility
 drop temporary function if exists subarray;
-create temporary function subarray as 'hivemall.tools.array.ArraySliceUDF';
+create temporary function subarray as 'hivemall.tools.array.SubarrayUDF';
 
 drop temporary function if exists array_slice;
 create temporary function array_slice as 'hivemall.tools.array.ArraySliceUDF';
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 4d46694ba..466e48b49 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -408,7 +408,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_concat")
 sqlContext.sql("CREATE TEMPORARY FUNCTION array_concat AS 'hivemall.tools.array.ArrayConcatUDF'")
 
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS subarray")
-sqlContext.sql("CREATE TEMPORARY FUNCTION subarray AS 'hivemall.tools.array.ArraySliceUDF'")
+sqlContext.sql("CREATE TEMPORARY FUNCTION subarray AS 'hivemall.tools.array.SubarrayUDF'")
 
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_slice")
 sqlContext.sql("CREATE TEMPORARY FUNCTION array_slice AS 'hivemall.tools.array.ArraySliceUDF'")
diff --git a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
index b02ef0249..b8c2722c0 100644
--- a/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
+++ b/spark/spark-2.2/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
@@ -1937,14 +1937,12 @@ object HivemallOps {
   }
 
   /**
-   * Alias of array_slice for a backward compatibility.
-   *
-   * @see [[hivemall.tools.array.ArraySliceUDF]]
+   * @see [[hivemall.tools.array.SubarrayUDF]]
    * @group tools.array
    */
   def subarray(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
     planHiveGenericUDF(
-      "hivemall.tools.array.ArraySliceUDF",
+      "hivemall.tools.array.SubarrayUDF",
       "subarray",
       original :: fromIndex :: toIndex :: Nil
     )
@@ -1954,11 +1952,11 @@ object HivemallOps {
    * @see [[hivemall.tools.array.ArraySliceUDF]]
    * @group tools.array
    */
-  def array_slice(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
+  def array_slice(original: Column, fromIndex: Column, length: Column): Column = withExpr {
     planHiveGenericUDF(
       "hivemall.tools.array.ArraySliceUDF",
       "array_slice",
-      original :: fromIndex :: toIndex :: Nil
+      original :: fromIndex :: length :: Nil
     )
   }
 
diff --git a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
index b77dc59ca..f8d377a65 100644
--- a/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
+++ b/spark/spark-2.2/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
@@ -427,6 +427,10 @@ class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
     )
     checkAnswer(
       DummyInputData.select(subarray(typedLit(Seq(1, 2, 3, 4, 5)), lit(2), lit(4))),
+      Row(Seq(3, 4))
+    )
+    checkAnswer(
+      DummyInputData.select(array_slice(typedLit(Seq(1, 2, 3, 4, 5)), lit(2), lit(4))),
       Row(Seq(3, 4, 5))
     )
     checkAnswer(
diff --git a/spark/spark-2.3/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-2.3/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
index c0fa6c59e..12852966a 100644
--- a/spark/spark-2.3/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
+++ b/spark/spark-2.3/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala
@@ -1937,19 +1937,29 @@ object HivemallOps {
   }
 
   /**
-   * Alias of array_slice for a backward compatibility.
-   *
-   * @see [[hivemall.tools.array.ArraySliceUDF]]
+   * @see [[hivemall.tools.array.SubarrayUDF]]
    * @group tools.array
    */
   def subarray(original: Column, fromIndex: Column, toIndex: Column): Column = withExpr {
     planHiveGenericUDF(
-      "hivemall.tools.array.ArraySliceUDF",
+      "hivemall.tools.array.SubarrayUDF",
       "subarray",
       original :: fromIndex :: toIndex :: Nil
     )
   }
 
+  /**
+   * @see [[hivemall.tools.array.ArraySliceUDF]]
+   * @group tools.array
+   */
+  def array_slice(original: Column, fromIndex: Column, length: Column): Column = withExpr {
+    planHiveGenericUDF(
+      "hivemall.tools.array.ArraySliceUDF",
+      "array_slice",
+      original :: fromIndex :: length :: Nil
+    )
+  }
+
   /**
    * @see [[hivemall.tools.array.ToStringArrayUDF]]
    * @group tools.array
diff --git a/spark/spark-2.3/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.3/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
index 268be0542..52e93b38c 100644
--- a/spark/spark-2.3/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
+++ b/spark/spark-2.3/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala
@@ -427,6 +427,10 @@ class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest {
     )
     checkAnswer(
       DummyInputData.select(subarray(typedLit(Seq(1, 2, 3, 4, 5)), lit(2), lit(4))),
+      Row(Seq(3, 4))
+    )
+    checkAnswer(
+      DummyInputData.select(array_slice(typedLit(Seq(1, 2, 3, 4, 5)), lit(2), lit(4))),
       Row(Seq(3, 4, 5))
     )
     checkAnswer(


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services