Posted to commits@spark.apache.org by gu...@apache.org on 2020/07/06 12:43:07 UTC

[spark] branch branch-3.0 updated: [SPARK-32162][PYTHON][TESTS] Improve error message of Pandas grouped map test with window

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 6a4200c  [SPARK-32162][PYTHON][TESTS] Improve error message of Pandas grouped map test with window
6a4200c is described below

commit 6a4200c057b4d4827099715969603cd92876336a
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Mon Jul 6 21:39:41 2020 +0900

    [SPARK-32162][PYTHON][TESTS] Improve error message of Pandas grouped map test with window
    
    ### What changes were proposed in this pull request?
    
    Improve the error message in the test GroupedMapInPandasTests.test_grouped_over_window_with_key so that it shows the incorrect values.
    
    ### Why are the changes needed?
    
    This test failure has come up often in Arrow testing because the test exercises a struct with timestamp values through a Pandas UDF. The current error message is not helpful: it only reports that the test failed, without showing the incorrect values. With this change, a failure instead raises an assertion error that includes the mismatched values.
    
    Before:
    
    ```
    ======================================================================
    FAIL: test_grouped_over_window_with_key (pyspark.sql.tests.test_pandas_grouped_map.GroupedMapInPandasTests)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "/spark/python/pyspark/sql/tests/test_pandas_grouped_map.py", line 588, in test_grouped_over_window_with_key
        self.assertTrue(all([r[0] for r in result]))
    AssertionError: False is not true
    ```
    
    After:
    ```
    ======================================================================
    ERROR: test_grouped_over_window_with_key (pyspark.sql.tests.test_pandas_grouped_map.GroupedMapInPandasTests)
    ----------------------------------------------------------------------
    ...
    AssertionError: {'start': datetime.datetime(2018, 3, 20, 0, 0), 'end': datetime.datetime(2018, 3, 25, 0, 0)} != {'start': datetime.datetime(2020, 3, 20, 0, 0), 'end': datetime.datetime(2020, 3, 25, 0, 0)}
    ```
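
    For illustration, a minimal standalone sketch (plain Python, not part of the patch) of the assertion pattern the test now uses; the values mirror the sample output above:

    ```
    import datetime

    expected = {'start': datetime.datetime(2018, 3, 20, 0, 0),
                'end': datetime.datetime(2018, 3, 25, 0, 0)}
    actual = {'start': datetime.datetime(2020, 3, 20, 0, 0),
              'end': datetime.datetime(2020, 3, 25, 0, 0)}

    try:
        # Attaching the values to the assertion message is what makes the
        # failure readable; a bare assertTrue(...) only reports
        # "False is not true".
        assert expected == actual, "{} != {}".format(expected, actual)
    except AssertionError as e:
        print(e)  # prints both dicts: "{...2018...} != {...2020...}"
    ```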
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Improved existing test
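
    For reference, tests in this module can typically be run from a Spark checkout with the dev script (per Spark's developer docs; exact flags may vary by version):

    ```
    ./python/run-tests --testnames 'pyspark.sql.tests.test_pandas_grouped_map GroupedMapInPandasTests.test_grouped_over_window_with_key'
    ```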
    
    Closes #28987 from BryanCutler/pandas-grouped-map-test-output-SPARK-32162.
    
    Authored-by: Bryan Cutler <cu...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 .../pyspark/sql/tests/test_pandas_grouped_map.py   | 57 +++++++++++++---------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/python/pyspark/sql/tests/test_pandas_grouped_map.py b/python/pyspark/sql/tests/test_pandas_grouped_map.py
index 7611943..cc6167e 100644
--- a/python/pyspark/sql/tests/test_pandas_grouped_map.py
+++ b/python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -545,13 +545,13 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
 
     def test_grouped_over_window_with_key(self):
 
-        data = [(0, 1, "2018-03-10T00:00:00+00:00", False),
-                (1, 2, "2018-03-11T00:00:00+00:00", False),
-                (2, 2, "2018-03-12T00:00:00+00:00", False),
-                (3, 3, "2018-03-15T00:00:00+00:00", False),
-                (4, 3, "2018-03-16T00:00:00+00:00", False),
-                (5, 3, "2018-03-17T00:00:00+00:00", False),
-                (6, 3, "2018-03-21T00:00:00+00:00", False)]
+        data = [(0, 1, "2018-03-10T00:00:00+00:00", [0]),
+                (1, 2, "2018-03-11T00:00:00+00:00", [0]),
+                (2, 2, "2018-03-12T00:00:00+00:00", [0]),
+                (3, 3, "2018-03-15T00:00:00+00:00", [0]),
+                (4, 3, "2018-03-16T00:00:00+00:00", [0]),
+                (5, 3, "2018-03-17T00:00:00+00:00", [0]),
+                (6, 3, "2018-03-21T00:00:00+00:00", [0])]
 
         expected_window = [
             {'start': datetime.datetime(2018, 3, 10, 0, 0),
@@ -562,30 +562,43 @@ class GroupedMapInPandasTests(ReusedSQLTestCase):
              'end': datetime.datetime(2018, 3, 25, 0, 0)},
         ]
 
-        expected = {0: (1, expected_window[0]),
-                    1: (2, expected_window[0]),
-                    2: (2, expected_window[0]),
-                    3: (3, expected_window[1]),
-                    4: (3, expected_window[1]),
-                    5: (3, expected_window[1]),
-                    6: (3, expected_window[2])}
+        expected_key = {0: (1, expected_window[0]),
+                        1: (2, expected_window[0]),
+                        2: (2, expected_window[0]),
+                        3: (3, expected_window[1]),
+                        4: (3, expected_window[1]),
+                        5: (3, expected_window[1]),
+                        6: (3, expected_window[2])}
+
+        # id -> array of group with len of num records in window
+        expected = {0: [1],
+                    1: [2, 2],
+                    2: [2, 2],
+                    3: [3, 3, 3],
+                    4: [3, 3, 3],
+                    5: [3, 3, 3],
+                    6: [3]}
 
         df = self.spark.createDataFrame(data, ['id', 'group', 'ts', 'result'])
         df = df.select(col('id'), col('group'), col('ts').cast('timestamp'), col('result'))
 
-        @pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
         def f(key, pdf):
             group = key[0]
             window_range = key[1]
-            # Result will be True if group and window range equal to expected
-            is_expected = pdf.id.apply(lambda id: (expected[id][0] == group and
-                                                   expected[id][1] == window_range))
-            return pdf.assign(result=is_expected)
 
-        result = df.groupby('group', window('ts', '5 days')).apply(f).select('result').collect()
+            # Make sure the key with group and window values are correct
+            for _, i in pdf.id.iteritems():
+                assert expected_key[i][0] == group, "{} != {}".format(expected_key[i][0], group)
+                assert expected_key[i][1] == window_range, \
+                    "{} != {}".format(expected_key[i][1], window_range)
 
-        # Check that all group and window_range values from udf matched expected
-        self.assertTrue(all([r[0] for r in result]))
+            return pdf.assign(result=[[group] * len(pdf)] * len(pdf))
+
+        result = df.groupby('group', window('ts', '5 days')).applyInPandas(f, df.schema)\
+            .select('id', 'result').collect()
+
+        for r in result:
+            self.assertListEqual(expected[r[0]], r[1])
 
     def test_case_insensitive_grouping_column(self):
         # SPARK-31915: case-insensitive grouping column should work.


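For context, a minimal self-contained sketch of the API exercised by this test: grouping by a column plus a time window and applying a keyed pandas function via applyInPandas. This assumes a local PySpark 3.x install with pandas and pyarrow available; the data and function names here are illustrative, not taken from the patch:

```
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [(0, 1, "2018-03-10T00:00:00+00:00"),
     (1, 2, "2018-03-11T00:00:00+00:00")],
    ['id', 'group', 'ts']
).select(col('id'), col('group'), col('ts').cast('timestamp'))

def check_key(key, pdf):
    # key[0] is the 'group' value; key[1] is the window, delivered as a
    # dict with 'start' and 'end' datetimes (as the test above relies on).
    group = key[0]
    assert (pdf.group == group).all(), "{} != {}".format(group, pdf.group.tolist())
    return pdf

result = df.groupby('group', window('ts', '5 days')).applyInPandas(check_key, df.schema)
result.show()
spark.stop()
```
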
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org