You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/02 02:26:12 UTC

[GitHub] [arrow] vibhatha commented on a change in pull request #12590: ARROW-15639 [C++][Python] UDF Scalar Function Implementation

vibhatha commented on a change in pull request #12590:
URL: https://github.com/apache/arrow/pull/12590#discussion_r840995247



##########
File path: python/pyarrow/tests/test_udf.py
##########
@@ -0,0 +1,350 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import List
+
+import pytest
+
+import pyarrow as pa
+from pyarrow import compute as pc
+from pyarrow.compute import register_function
+from pyarrow.compute import InputType
+
+
+def get_function_doc(summary: str, desc: str, arg_names: List[str]):
+    """Assemble the documentation dict for a test UDF.
+
+    The returned dict has exactly three keys, in this order: "summary",
+    "description" and "arg_names". Each holds the corresponding argument
+    unchanged.
+    """
+    return {
+        "summary": summary,
+        "description": desc,
+        "arg_names": arg_names,
+    }
+
+# scalar unary function data
+
+
+unary_doc = get_function_doc(
+    summary="add function",
+    desc="test add function",
+    arg_names=["scalar1"],
+)
+
+
+def unary_function(scalar1):
+    """Return ``scalar1 + 1`` computed with the "add" compute function."""
+    operands = [scalar1, 1]
+    return pc.call_function("add", operands)
+
+# scalar binary function data
+
+
+binary_doc = get_function_doc(
+    summary="y=mx",
+    desc="find y from y = mx",
+    arg_names=["m", "x"],
+)
+
+
+def binary_function(m, x):
+    """Return ``m * x`` computed with the "multiply" compute function."""
+    operands = [m, x]
+    return pc.call_function("multiply", operands)
+
+# scalar ternary function data
+
+
+ternary_doc = get_function_doc(
+    summary="y=mx+c",
+    desc="find y from y = mx + c",
+    arg_names=["m", "x", "c"],
+)
+
+
+def ternary_function(m, x, c):
+    """Return ``m * x + c`` using the "multiply" and "add" functions."""
+    return pc.call_function(
+        "add",
+        [pc.call_function("multiply", [m, x]), c],
+    )
+
+# scalar varargs function data
+
+
+varargs_doc = get_function_doc(
+    summary="z=ax+by+c",
+    desc="find z from z = ax + by + c",
+    arg_names=["a", "x", "b", "y", "c"],
+)
+
+
+def varargs_function(a, x, b, y, c):
+    """Return ``a * x + b * y + c`` using "multiply" and "add"."""
+    def call(name, *operands):
+        # Thin wrapper so each step reads as call(op, lhs, rhs).
+        return pc.call_function(name, list(operands))
+
+    first_term = call("multiply", a, x)
+    second_term = call("multiply", b, y)
+    return call("add", call("add", first_term, second_term), c)
+
+
+@pytest.fixture
+def function_input_types():
+    """Input type lists for the scalar and array function variants.
+
+    The first four entries hold int64 scalar input types for 1, 2, 3
+    and 5 arguments; the last four hold int64 array input types for the
+    same argument counts.
+    """
+    arg_counts = (1, 2, 3, 5)
+    scalar_types = [
+        [InputType.scalar(pa.int64()) for _ in range(count)]
+        for count in arg_counts
+    ]
+    array_types = [
+        [InputType.array(pa.int64()) for _ in range(count)]
+        for count in arg_counts
+    ]
+    return scalar_types + array_types
+
+
+@pytest.fixture
+def function_output_types():
+    """Return four int64 output types, one per sample function."""
+    return [pa.int64() for _ in range(4)]
+
+
+@pytest.fixture
+def function_names():
+    """Names for the sample functions: scalar variants, then array ones."""
+    scalar_names = [
+        "scalar_y=x+k",
+        "scalar_y=mx",
+        "scalar_y=mx+c",
+        "scalar_z=ax+by+c",
+    ]
+    array_names = [
+        "array_y=x+k",
+        "array_y=mx",
+        "array_y=mx+c",
+        "array_z=ax+by+c",
+    ]
+    return scalar_names + array_names
+
+
+@pytest.fixture
+def function_arities():
+    """Argument counts of the four sample functions, in definition order.
+
+    NOTE(review): this list has four entries while ``function_names``
+    has eight; presumably the consuming test reuses it for both the
+    scalar and the array variants -- confirm against the test body.
+    """
+    return [1, 2, 3, 5]
+
+
+@pytest.fixture
+def function_docs():
+    """Documentation dicts of the four sample functions, in arity order."""
+    return [unary_doc, binary_doc, ternary_doc, varargs_doc]
+
+
+@pytest.fixture
+def functions():
+    """The four sample Python callables, in arity order (1, 2, 3, 5)."""
+    return [
+        unary_function, binary_function, ternary_function, varargs_function
+    ]
+
+
+@pytest.fixture
+def function_inputs():
+    """Argument lists for each sample function.
+
+    The first four entries are int64 scalars, the last four are
+    two-element int64 arrays; within each group the entries take 1, 2,
+    3 and 5 arguments respectively.
+    """
+    def scalars(*values):
+        return [pa.scalar(value, pa.int64()) for value in values]
+
+    def arrays(*columns):
+        return [pa.array(column, pa.int64()) for column in columns]
+
+    return [
+        # scalar input data
+        scalars(10),
+        scalars(10, 2),
+        scalars(10, 2, 5),
+        scalars(2, 10, 3, 20, 5),
+        # array input data
+        arrays([10, 20]),
+        arrays([10, 20], [2, 4]),
+        arrays([10, 20], [2, 4], [5, 10]),
+        arrays([2, 3], [10, 20], [3, 7], [20, 30], [5, 10]),
+    ]
+
+
+@pytest.fixture
+def expected_outputs():
+    """Expected results, index-aligned with ``function_inputs``."""
+    scalar_results = [
+        pa.scalar(11, pa.int64()),  # 10 + 1
+        pa.scalar(20, pa.int64()),  # 10 * 2
+        pa.scalar(25, pa.int64()),  # 10 * 2 + 5
+        pa.scalar(85, pa.int64()),  # (2 * 10) + (3 * 20) + 5
+    ]
+    array_results = [
+        pa.array([11, 21], pa.int64()),  # [10 + 1, 20 + 1]
+        pa.array([20, 80], pa.int64()),  # [10 * 2, 20 * 4]
+        pa.array([25, 90], pa.int64()),  # [(10 * 2) + 5, (20 * 4) + 10]
+        # [(2 * 10) + (3 * 20) + 5, (3 * 20) + (7 * 30) + 10]
+        pa.array([85, 280], pa.int64()),
+    ]
+    return scalar_results + array_results
+
+
+def test_scalar_udf_function_with_scalar_data(function_names,

Review comment:
       It would be better to fix this in this PR itself, since it is core functionality. Along with this, I also thought about how to let users register a function that supports multiple input/output type configurations. The simplest option is to take a list of such functions from the user and add multiple kernels when we register the function; this seems to be the most straightforward way to solve the problem. The other way is to create a hash from the name and the input and output types, so that even though the user calls a function by name, internally we infer the types and look up the registered name. But that could overcomplicate things. I would like to know your thoughts on this.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org