Posted to commits@spark.apache.org by gu...@apache.org on 2023/03/20 00:26:21 UTC

[spark] branch branch-3.4 updated: [SPARK-41818][SPARK-41843][CONNECT][PYTHON][TESTS] Enable more parity tests

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new a0993ba76f2 [SPARK-41818][SPARK-41843][CONNECT][PYTHON][TESTS] Enable more parity tests
a0993ba76f2 is described below

commit a0993ba76f2c62558faafd57cf4b8281f5dbe451
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Mon Mar 20 09:25:52 2023 +0900

    [SPARK-41818][SPARK-41843][CONNECT][PYTHON][TESTS] Enable more parity tests
    
    ### What changes were proposed in this pull request?
    
    Enables more parity tests: re-enables the `recoverPartitions` doctest
    (SPARK-41818) and the `call_udf` doctest (SPARK-41843) under Spark
    Connect, removes several now-passing `@unittest.skip` markers from the
    UDF and pandas UDF parity suites, and splits two JVM-dependent tests
    into `test_*`/`check_*` pairs so the parity suite can reuse the shared
    check bodies.
    
    ### Why are the changes needed?
    
    These tests now pass under Spark Connect, so enabling them improves
    parity test coverage.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Enabled related parity tests.
    
    Closes #40470 from ueshin/issues/SPARK-41818/parity.
    
    Authored-by: Takuya UESHIN <ue...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit e911c5efea7d12800fdbe3cb8effbc0d5ce763f0)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/sql/connect/catalog.py              |  3 ---
 python/pyspark/sql/connect/functions.py            |  3 ---
 .../sql/tests/connect/test_parity_pandas_udf.py    |  8 --------
 .../pyspark/sql/tests/connect/test_parity_udf.py   | 18 ++--------------
 python/pyspark/sql/tests/test_udf.py               | 24 ++++++++++++++--------
 5 files changed, 17 insertions(+), 39 deletions(-)

diff --git a/python/pyspark/sql/connect/catalog.py b/python/pyspark/sql/connect/catalog.py
index 261f87b4cc6..788c48e037b 100644
--- a/python/pyspark/sql/connect/catalog.py
+++ b/python/pyspark/sql/connect/catalog.py
@@ -331,9 +331,6 @@ def _test() -> None:
         PySparkSession.builder.appName("sql.connect.catalog tests").remote("local[4]").getOrCreate()
     )
 
-    # TODO(SPARK-41818): java.lang.ClassNotFoundException) .DefaultSource
-    del pyspark.sql.connect.catalog.Catalog.recoverPartitions.__doc__
-
     (failure_count, test_count) = doctest.testmod(
         pyspark.sql.connect.catalog,
         globs=globs,
diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index e8bb06a3903..f01e333c9d7 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -2482,9 +2482,6 @@ def _test() -> None:
     # Spark Connect does not support Spark Context but the test depends on that.
     del pyspark.sql.connect.functions.monotonically_increasing_id.__doc__
 
-    # TODO(SPARK-41843): Implement SparkSession.udf
-    del pyspark.sql.connect.functions.call_udf.__doc__
-
     globs["spark"] = (
         PySparkSession.builder.appName("sql.connect.functions tests")
         .remote("local[4]")
diff --git a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py b/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py
index 4b1ce0a9587..841ade40f5e 100644
--- a/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py
+++ b/python/pyspark/sql/tests/connect/test_parity_pandas_udf.py
@@ -32,14 +32,6 @@ class PandasUDFParityTests(PandasUDFTestsMixin, ReusedConnectTestCase):
     def test_pandas_udf_timestamp_ntz(self):
         super().test_pandas_udf_timestamp_ntz()
 
-    @unittest.skip("Spark Connect does not support spark.conf but the test depends on it.")
-    def test_pandas_udf_detect_unsafe_type_conversion(self):
-        super().test_pandas_udf_detect_unsafe_type_conversion()
-
-    @unittest.skip("Spark Connect does not support spark.conf but the test depends on it.")
-    def test_pandas_udf_arrow_overflow(self):
-        super().test_pandas_udf_arrow_overflow()
-
     # TODO(SPARK-42247): standardize `returnType` attribute of UDF
     @unittest.skip("Fails in Spark Connect, should enable.")
     def test_pandas_udf_decorator(self):
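
Both skips removed above cited `spark.conf`; the underlying tests toggle a
SQL conf around the pandas UDF under test. A minimal sketch of that conf
round-trip, assuming a plain local session (the real tests use the
`sql_conf` helper from the shared test mixins):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

key = "spark.sql.execution.pandas.convertToArrowArraySafely"
original = spark.conf.get(key, None)

# With safe Arrow conversion enabled, unsafe casts in a pandas UDF raise
# instead of silently overflowing; this is the conf the skipped tests need.
spark.conf.set(key, "true")
try:
    pass  # run the pandas UDF under test and assert on the raised error
finally:
    # Restore the previous value so other tests are unaffected.
    if original is None:
        spark.conf.unset(key)
    else:
        spark.conf.set(key, original)
```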
diff --git a/python/pyspark/sql/tests/connect/test_parity_udf.py b/python/pyspark/sql/tests/connect/test_parity_udf.py
index 50f0d36be5d..10d93b71ebf 100644
--- a/python/pyspark/sql/tests/connect/test_parity_udf.py
+++ b/python/pyspark/sql/tests/connect/test_parity_udf.py
@@ -44,10 +44,6 @@ class UDFParityTests(BaseUDFTestsMixin, ReusedConnectTestCase):
     def test_same_accumulator_in_udfs(self):
         super().test_same_accumulator_in_udfs()
 
-    @unittest.skip("Spark Connect does not support spark.conf but the test depends on it.")
-    def test_udf_with_column_vector(self):
-        super().test_udf_with_column_vector()
-
     @unittest.skip("Spark Connect does not support spark.conf but the test depends on it.")
     def test_udf_timestamp_ntz(self):
         super().test_udf_timestamp_ntz()
@@ -56,14 +52,6 @@ class UDFParityTests(BaseUDFTestsMixin, ReusedConnectTestCase):
     def test_broadcast_in_udf(self):
         super().test_broadcast_in_udf()
 
-    @unittest.skip("Spark Connect does not support sql_conf but the test depends on it.")
-    def test_file_dsv2_with_udf_filter(self):
-        super().test_file_dsv2_with_udf_filter()
-
-    @unittest.skip("Spark Connect does not support sql_conf but the test depends on it.")
-    def test_udf_in_join_condition(self):
-        super().test_udf_in_join_condition()
-
     @unittest.skip("Spark Connect does not support cache() but the test depends on it.")
     def test_udf_cache(self):
         super().test_udf_cache()
@@ -76,13 +64,11 @@ class UDFParityTests(BaseUDFTestsMixin, ReusedConnectTestCase):
     def test_nondeterministic_udf3(self):
         super().test_nondeterministic_udf3()
 
-    @unittest.skip("Requires JVM access.")
     def test_nondeterministic_udf_in_aggregate(self):
-        super().test_nondeterministic_udf_in_aggregate()
+        self.check_nondeterministic_udf_in_aggregate()
 
-    @unittest.skip("Requires JVM access.")
     def test_udf_registration_return_type_not_none(self):
-        super().test_udf_registration_return_type_not_none()
+        self.check_udf_registration_return_type_not_none()
 
     @unittest.skip("Spark Connect doesn't support RDD but the test depends on it.")
     def test_worker_original_stdin_closed(self):
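
The last hunk above shows the pattern used throughout this change: rather
than skipping a test whose only JVM dependency is `QuietTest(self.sc)`, the
base suite is split into a `test_*` wrapper and a `check_*` body, and the
parity suite overrides the wrapper to call the body directly. A
self-contained sketch of that shape (toy assertion, not the real UDF logic):

```python
import contextlib
import unittest


class BaseTestsMixin:
    @contextlib.contextmanager
    def quiet(self):
        # Stand-in for QuietTest(self.sc), which silences JVM log output
        # and therefore requires a SparkContext.
        yield

    def test_invalid_return_type(self):
        # JVM-backed suite: wrap the shared assertion in the quiet context.
        with self.quiet():
            self.check_invalid_return_type()

    def check_invalid_return_type(self):
        # Shared assertion body, independent of any JVM-only facility.
        with self.assertRaisesRegex(TypeError, "Invalid return type"):
            raise TypeError("Invalid return type")


class ParityTests(BaseTestsMixin, unittest.TestCase):
    def test_invalid_return_type(self):
        # Connect-style parity suite: no SparkContext, so skip the wrapper
        # and run the shared check body directly.
        self.check_invalid_return_type()


if __name__ == "__main__":
    unittest.main()
```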
diff --git a/python/pyspark/sql/tests/test_udf.py b/python/pyspark/sql/tests/test_udf.py
index b766b0c0178..250386d8c6e 100644
--- a/python/pyspark/sql/tests/test_udf.py
+++ b/python/pyspark/sql/tests/test_udf.py
@@ -106,10 +106,13 @@ class BaseUDFTestsMixin(object):
 
     def test_udf_registration_return_type_not_none(self):
         with QuietTest(self.sc):
-            with self.assertRaisesRegex(TypeError, "Invalid return type"):
-                self.spark.catalog.registerFunction(
-                    "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()
-                )
+            self.check_udf_registration_return_type_not_none()
+
+    def check_udf_registration_return_type_not_none(self):
+        with self.assertRaisesRegex(TypeError, "Invalid return type"):
+            self.spark.catalog.registerFunction(
+                "f", UserDefinedFunction(lambda x, y: len(x) + y, StringType()), StringType()
+            )
 
     def test_nondeterministic_udf(self):
         # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
@@ -154,17 +157,20 @@ class BaseUDFTestsMixin(object):
         self.assertFalse(deterministic)
 
     def test_nondeterministic_udf_in_aggregate(self):
+        with QuietTest(self.sc):
+            self.check_nondeterministic_udf_in_aggregate()
+
+    def check_nondeterministic_udf_in_aggregate(self):
         from pyspark.sql.functions import sum
         import random
 
         udf_random_col = udf(lambda: int(100 * random.random()), "int").asNondeterministic()
         df = self.spark.range(10)
 
-        with QuietTest(self.sc):
-            with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
-                df.groupby("id").agg(sum(udf_random_col())).collect()
-            with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
-                df.agg(sum(udf_random_col())).collect()
+        with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
+            df.groupby("id").agg(sum(udf_random_col())).collect()
+        with self.assertRaisesRegex(AnalysisException, "nondeterministic"):
+            df.agg(sum(udf_random_col())).collect()
 
     def test_chained_udf(self):
         self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType())

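With `QuietTest(self.sc)` confined to the `test_*` wrappers, the extracted
`check_*` bodies use only DataFrame APIs and therefore run unchanged against
Spark Connect. A standalone sketch of the nondeterministic-aggregate check,
assuming a local session:

```python
import random

from pyspark.errors import AnalysisException
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as sum_, udf

spark = SparkSession.builder.master("local[2]").getOrCreate()

# A nondeterministic Python UDF, as in check_nondeterministic_udf_in_aggregate.
udf_random_col = udf(lambda: int(100 * random.random()), "int").asNondeterministic()
df = spark.range(10)

try:
    # The analyzer rejects nondeterministic expressions inside aggregates,
    # which is exactly what the shared check body asserts.
    df.groupby("id").agg(sum_(udf_random_col())).collect()
except AnalysisException as e:
    assert "nondeterministic" in str(e)
```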
