Posted to commits@spark.apache.org by gu...@apache.org on 2022/07/11 07:30:56 UTC
[spark] branch master updated: [SPARK-39728][PYTHON] Add explicit PySpark SQL function parity check
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a68bcc62244 [SPARK-39728][PYTHON] Add explicit PySpark SQL function parity check
a68bcc62244 is described below
commit a68bcc622446fa85414286da9563da3bcdf1fbaa
Author: Andrew Ray <ra...@gmail.com>
AuthorDate: Mon Jul 11 16:30:45 2022 +0900
[SPARK-39728][PYTHON] Add explicit PySpark SQL function parity check
### What changes were proposed in this pull request?
This PR adds a test that compares the list of DataFrame functions available in pyspark.sql.functions with those available in the Scala/Java DataFrame API in org.apache.spark.sql.functions. If a function is added to one API but not the other, the test fails until the exclusion lists in the test are updated.
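For illustration only, the core of that comparison can be sketched interactively. The snippet below is a rough outline of what the new test automates, and it assumes an already-running SparkSession bound to the name `spark`; the other identifiers mirror the ones used in the test itself.

```python
# Rough sketch of the comparison this PR automates (hypothetical interactive use;
# assumes an already-running SparkSession bound to the name `spark`).
from inspect import getmembers, isfunction

from pyspark.sql import functions

# Public functions exposed by the Python API.
py_names = {name for name, _ in getmembers(functions, isfunction) if not name.startswith("_")}

# Members of org.apache.spark.sql.functions, reached through the Py4J gateway,
# the same way the new test uses self.sc._jvm.functions.
jvm_names = {name for name, _ in getmembers(spark.sparkContext._jvm.functions)}

# Functions present on the JVM side that have no Python wrapper (before any exclusions).
print(sorted(jvm_names - py_names))
```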
### Why are the changes needed?
Currently there is no easy way to verify which functions are missing from the Python DataFrame API.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
This PR only adds a test; no non-test code is changed.
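For reference, one way to run just the new check locally is sketched below. This is an assumption-laden sketch: it needs a PySpark installation whose test modules are importable, so that ReusedSQLTestCase can start its own local SparkContext. The whole module can also be run through Spark's python/run-tests harness.

```python
# Sketch: run only the new parity check with the standard unittest runner.
import unittest

from pyspark.sql.tests.test_functions import FunctionsTests

if __name__ == "__main__":
    # Build a suite containing just test_function_parity and run it verbosely.
    suite = unittest.TestSuite([FunctionsTests("test_function_parity")])
    unittest.TextTestRunner(verbosity=2).run(suite)
```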
Closes #37144 from aray/python-function-parity-test.
Authored-by: Andrew Ray <ra...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/sql/tests/test_functions.py | 56 ++++++++++++++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 5c6acaffa32..5091fa711a8 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -16,6 +16,7 @@
#
import datetime
+from inspect import getmembers, isfunction
from itertools import chain
import re
import math
@@ -51,10 +52,65 @@ from pyspark.sql.functions import (
slice,
least,
)
+from pyspark.sql import functions
from pyspark.testing.sqlutils import ReusedSQLTestCase, SQLTestUtils
class FunctionsTests(ReusedSQLTestCase):
+    def test_function_parity(self):
+        # This test compares the available list of functions in pyspark.sql.functions with those
+        # available in the Scala/Java DataFrame API in org.apache.spark.sql.functions.
+        #
+        # NOTE FOR DEVELOPERS:
+        # If this test fails, one of the following needs to happen:
+        # * If a function was added to org.apache.spark.sql.functions, it either needs to be added
+        #   to pyspark.sql.functions or added to the expected_missing_in_py set below.
+        # * If a function was added to pyspark.sql.functions that was already in
+        #   org.apache.spark.sql.functions, then it needs to be removed from expected_missing_in_py
+        #   below. If the function has a different name, it needs to be added to the py_equiv_jvm
+        #   mapping.
+        # * If it's not related to an added/removed function, then the exclusion list
+        #   jvm_excluded_fn likely needs to be updated.
+
+        jvm_fn_set = {name for (name, value) in getmembers(self.sc._jvm.functions)}
+        py_fn_set = {name for (name, value) in getmembers(functions, isfunction) if name[0] != "_"}
+
+        # Functions on the JVM side that we do not expect to be available in Python because they
+        # are deprecated, irrelevant to Python, or have Python equivalents.
+        jvm_excluded_fn = [
+            "callUDF",  # deprecated, use call_udf
+            "typedlit",  # Scala only
+            "typedLit",  # Scala only
+            "monotonicallyIncreasingId",  # deprecated, use monotonically_increasing_id
+            "negate",  # equivalent to the Python -expression
+            "not",  # equivalent to the Python ~expression
+            "udaf",  # used for creating UDAFs, which are not supported in PySpark
+        ]
+
+        jvm_fn_set.difference_update(jvm_excluded_fn)
+
+        # For functions that are named differently in PySpark, this is the mapping of their
+        # Python name to the JVM equivalent.
+        py_equiv_jvm = {"create_map": "map"}
+        for py_name, jvm_name in py_equiv_jvm.items():
+            if py_name in py_fn_set:
+                py_fn_set.remove(py_name)
+                py_fn_set.add(jvm_name)
+
+        missing_in_py = jvm_fn_set.difference(py_fn_set)
+
+        # Functions that we expect to be missing in Python until they are added to pyspark.
+        expected_missing_in_py = {
+            "call_udf",  # TODO(SPARK-39734)
+            "localtimestamp",  # TODO(SPARK-36259)
+            "map_contains_key",  # TODO(SPARK-39733)
+            "pmod",  # TODO(SPARK-37348)
+        }
+
+        self.assertEqual(
+            expected_missing_in_py, missing_in_py, "Missing functions in pyspark not as expected"
+        )
+
     def test_explode(self):
         from pyspark.sql.functions import explode, explode_outer, posexplode_outer
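
As a hypothetical illustration of the developer note embedded in the new test (not part of this commit): when one of the tracked functions gains a PySpark wrapper, its entry is dropped from expected_missing_in_py so the parity check keeps passing. If, say, pmod (SPARK-37348) were added to pyspark.sql.functions, the set would become:

```python
# Hypothetical follow-up state of the tracking set after pmod is exposed in
# pyspark.sql.functions; its entry is removed so the assertion continues to hold.
expected_missing_in_py = {
    "call_udf",  # TODO(SPARK-39734)
    "localtimestamp",  # TODO(SPARK-36259)
    "map_contains_key",  # TODO(SPARK-39733)
}
```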