You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/09 03:44:01 UTC
[spark] branch branch-3.2 updated: [SPARK-36001][PYTHON] Assume result's index to be disordered in tests with operations on different Series

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new cb9bd5f  [SPARK-36001][PYTHON] Assume result's index to be disordered in tests with operations on different Series
cb9bd5f is described below

commit cb9bd5f455ca26d79067f112115723944f6e01be
Author: Xinrong Meng <xi...@databricks.com>
AuthorDate: Fri Jul 9 12:42:48 2021 +0900

    [SPARK-36001][PYTHON] Assume result's index to be disordered in tests with operations on different Series
    
    ### What changes were proposed in this pull request?
    For tests with operations on different Series, sort index of results before comparing them with pandas.
    
    ### Why are the changes needed?
    We have many tests with operations on different Series in `spark/python/pyspark/pandas/tests/data_type_ops/` that assume the result's index to be sorted and then compare to the pandas' behavior.
    
    The assumption on the result's index ordering is wrong since Spark DataFrame join is used internally and the order is not preserved if the data being in different partitions.
    
    So we should assume the result to be disordered and sort the index of such results before comparing them with pandas.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Unit tests.
    
    Closes #33274 from xinrong-databricks/datatypeops_testdiffframe.
    
    Authored-by: Xinrong Meng <xi...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit af81ad0d7ebfbaa20f41b5e1e7445c8ad0f4c498)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../pandas/tests/data_type_ops/test_binary_ops.py  |  2 +-
 .../pandas/tests/data_type_ops/test_boolean_ops.py | 30 ++++++++++++----------
 .../pandas/tests/data_type_ops/test_complex_ops.py |  6 +++--
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py
index 9a52a6f..139ad11 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_binary_ops.py
@@ -53,7 +53,7 @@ class BinaryOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         with option_context("compute.ops_on_diff_frames", True):
             for psser in self.pssers:
                 self.assertRaises(TypeError, lambda: self.psser + psser)
-            self.assert_eq(self.psser + self.psser, self.pser + self.pser)
+            self.assert_eq(self.pser + self.pser, (self.psser + self.psser).sort_index())
 
     def test_sub(self):
         self.assertRaises(TypeError, lambda: self.psser - "x")
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index d714d7e..2ca6d16 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -72,7 +72,7 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
 
             for pser, psser in self.non_numeric_pser_psser_pairs:
                 if isinstance(psser.spark.data_type, BooleanType):
-                    self.assert_eq(self.pser + pser, self.psser + psser)
+                    self.assert_eq(self.pser + pser, (self.psser + psser).sort_index())
                 else:
                     self.assertRaises(TypeError, lambda: self.psser + psser)
 
@@ -108,7 +108,7 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
 
             for pser, psser in self.non_numeric_pser_psser_pairs:
                 if isinstance(psser.spark.data_type, BooleanType):
-                    self.assert_eq(self.pser * pser, self.psser * psser)
+                    self.assert_eq(self.pser * pser, (self.psser * psser).sort_index())
                 else:
                     self.assertRaises(TypeError, lambda: self.psser * psser)
 
@@ -260,11 +260,12 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         other_pser = pd.Series([False, None, True], dtype="bool")
         other_psser = ps.from_pandas(other_pser)
         with option_context("compute.ops_on_diff_frames", True):
-            self.assert_eq(pser & other_pser, psser & other_psser)
+            self.assert_eq(pser & other_pser, (psser & other_psser).sort_index())
             self.check_extension(
-                pser & other_pser.astype("boolean"), psser & other_psser.astype("boolean")
+                pser & other_pser.astype("boolean"),
+                (psser & other_psser.astype("boolean")).sort_index(),
             )
-            self.assert_eq(other_pser & pser, other_psser & psser)
+            self.assert_eq(other_pser & pser, (other_psser & psser).sort_index())
 
     def test_rand(self):
         pser = pd.Series([True, False, None], dtype="bool")
@@ -284,11 +285,12 @@ class BooleanOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         other_pser = pd.Series([False, None, True], dtype="bool")
         other_psser = ps.from_pandas(other_pser)
         with option_context("compute.ops_on_diff_frames", True):
-            self.assert_eq(pser | other_pser, psser | other_psser)
+            self.assert_eq(pser | other_pser, (psser | other_psser).sort_index())
             self.check_extension(
-                pser | other_pser.astype("boolean"), psser | other_psser.astype("boolean")
+                pser | other_pser.astype("boolean"),
+                (psser | other_psser.astype("boolean")).sort_index(),
             )
-            self.assert_eq(other_pser | pser, other_psser | psser)
+            self.assert_eq(other_pser | pser, (other_psser | psser).sort_index())
 
     def test_ror(self):
         pser = pd.Series([True, False, None], dtype="bool")
@@ -413,7 +415,7 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
                     self.assertRaises(TypeError, lambda: self.psser + psser)
             bool_pser = pd.Series([False, False, False])
             bool_psser = ps.from_pandas(bool_pser)
-            self.check_extension(self.pser + bool_pser, self.psser + bool_psser)
+            self.check_extension(self.pser + bool_pser, (self.psser + bool_psser).sort_index())
 
     def test_sub(self):
         pser = self.pser
@@ -448,7 +450,7 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
                     self.assertRaises(TypeError, lambda: self.psser * psser)
             bool_pser = pd.Series([True, True, True])
             bool_psser = ps.from_pandas(bool_pser)
-            self.check_extension(self.pser * bool_pser, self.psser * bool_psser)
+            self.check_extension(self.pser * bool_pser, (self.psser * bool_psser).sort_index())
 
     def test_truediv(self):
         pser = self.pser
@@ -596,8 +598,8 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         self.check_extension(pser & pser, psser & psser)
 
         with option_context("compute.ops_on_diff_frames", True):
-            self.check_extension(pser & self.other_pser, psser & self.other_psser)
-            self.check_extension(self.other_pser & pser, self.other_psser & psser)
+            self.check_extension(pser & self.other_pser, (psser & self.other_psser).sort_index())
+            self.check_extension(self.other_pser & pser, (self.other_psser & psser).sort_index())
 
     def test_rand(self):
         self.check_extension(True & self.pser, True & self.psser)
@@ -611,8 +613,8 @@ class BooleanExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
         self.check_extension(pser | pser, psser | psser)
 
         with option_context("compute.ops_on_diff_frames", True):
-            self.check_extension(pser | self.other_pser, psser | self.other_psser)
-            self.check_extension(self.other_pser | pser, self.other_psser | psser)
+            self.check_extension(pser | self.other_pser, (psser | self.other_psser).sort_index())
+            self.check_extension(self.other_pser | pser, (self.other_psser | psser).sort_index())
 
     def test_ror(self):
         self.check_extension(True | self.pser, True | self.psser)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
index c62ec3f..0480285 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
@@ -120,8 +120,10 @@ class ComplexOpsTest(PandasOnSparkTestCase, TestCasesUtils):
                 self.assert_eq(
                     self.non_numeric_array_psers.get(data_type)
                     + self.non_numeric_array_psers.get(data_type),
-                    self.non_numeric_array_pssers.get(data_type)
-                    + self.non_numeric_array_pssers.get(data_type),
+                    (
+                        self.non_numeric_array_pssers.get(data_type)
+                        + self.non_numeric_array_pssers.get(data_type)
+                    ).sort_index(),
                 )
 
             # Numeric array + Non-numeric array

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org