You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2021/02/01 18:20:36 UTC

[GitHub] [beam] ibzib commented on a change in pull request #13853: [BEAM-9547] Produce better errors for some groupby() and set_index() configurations

ibzib commented on a change in pull request #13853:
URL: https://github.com/apache/beam/pull/13853#discussion_r568029618



##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -332,6 +383,8 @@ def test_dataframe_getitem(self):
 
   def test_loc(self):
     dates = pd.date_range('1/1/2000', periods=8)
+    # We do not preserve the freq attribute on a DateTime index. Is this a bug?

Review comment:
       Should we track this with a JIRA?

##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -182,14 +194,53 @@ def test_groupby_project(self):
     self._run_test(lambda df: df.groupby('group')['foo'].median(), df)
     self._run_test(lambda df: df.groupby('group')['baz'].median(), df)
     self._run_test(lambda df: df.groupby('group')[['bar', 'baz']].median(), df)
+
+  def test_groupby_errors(self):
+    df = pd.DataFrame({
+        'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
+        'foo': [None if i % 11 == 0 else i for i in range(100)],
+        'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
+        'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
+    })
+
+    # non-existent projection column
     self._run_test(
         lambda df: df.groupby('group')[['bar', 'baz']].bar.median(),
         df,
         expect_error=True)
     self._run_test(
-        lambda df: df.groupby('group')[['bat']].median(), df, expect_error=True)
+        lambda df: df.groupby('group')[['bad']].median(), df, expect_error=True)
+
+    self._run_test(
+        lambda df: df.groupby('group').bad.median(), df, expect_error=True)
+
+    # non-existent grouping label
+    self._run_test(
+        lambda df: df.groupby(['really_bad', 'foo', 'bad']).foo.sum(),
+        df,
+        expect_error=True)
+    self._run_test(
+        lambda df: df.groupby('bad').foo.sum(), df, expect_error=True)
+
+  def test_set_index(self):
+    df = pd.DataFrame({
+        # Generate some unique columns to use for indexes

Review comment:
       Is it possible to simplify this test?
   - Randomness means when that a failure happens it may be difficult to repro. Even if you seed the RNG with some known value, it's clearer to just hard code the result.
   - 100 values is a lot. I'm guessing you should be able to achieve the same coverage with a much smaller example.

##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -63,23 +63,35 @@ def _run_test(self, func, *args, distributed=True, expect_error=False):
             "Expected an error:\n{expected}\nbut successfully "
             f"returned:\n{actual}")
 
-    if not expect_error:
-      if hasattr(expected, 'equals'):
+    if expect_error:
+      if not isinstance(actual,
+                        type(expected)) or not str(actual) == str(expected):
+        raise AssertionError(
+            f'Expected {expected!r} to be raised, but got {actual!r}'
+        ) from actual
+    else:
+      if isinstance(expected, pd.core.generic.NDFrame):
         if distributed:
-          cmp = lambda df: expected.sort_index().equals(df.sort_index())
+          expected = expected.sort_index()
+          actual = actual.sort_index()
+
+        if isinstance(expected, pd.Series):
+          pd.testing.assert_series_equal(expected, actual)
+        elif isinstance(expected, pd.DataFrame):
+          pd.testing.assert_frame_equal(expected, actual)
         else:
-          cmp = expected.equals
-      elif isinstance(expected, float):
-        cmp = lambda x: (math.isnan(x) and math.isnan(expected)
-                         ) or x == expected == 0 or abs(expected - x) / (
-                             abs(expected) + abs(x)) < 1e-8
+          raise ValueError(
+              "Expected value is an NDFrame, but not a Series or DataFrame.")

Review comment:
       Nit: include the actual type in the error message.

##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -182,14 +194,53 @@ def test_groupby_project(self):
     self._run_test(lambda df: df.groupby('group')['foo'].median(), df)
     self._run_test(lambda df: df.groupby('group')['baz'].median(), df)
     self._run_test(lambda df: df.groupby('group')[['bar', 'baz']].median(), df)
+
+  def test_groupby_errors(self):
+    df = pd.DataFrame({
+        'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
+        'foo': [None if i % 11 == 0 else i for i in range(100)],
+        'bar': [None if i % 7 == 0 else 99 - i for i in range(100)],
+        'baz': [None if i % 13 == 0 else i * 2 for i in range(100)],
+    })
+
+    # non-existent projection column
     self._run_test(
         lambda df: df.groupby('group')[['bar', 'baz']].bar.median(),
         df,
         expect_error=True)
     self._run_test(
-        lambda df: df.groupby('group')[['bat']].median(), df, expect_error=True)
+        lambda df: df.groupby('group')[['bad']].median(), df, expect_error=True)
+
+    self._run_test(
+        lambda df: df.groupby('group').bad.median(), df, expect_error=True)
+
+    # non-existent grouping label

Review comment:
       You should extract `df` as a constant and separate these blocks into at least two separate test methods.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org