You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2022/03/14 22:45:42 UTC

[GitHub] [beam] TheNeuralBit commented on a change in pull request #17043: [BEAM-13966] Add pivot(), a non-deferred column operation on categorical columns

TheNeuralBit commented on a change in pull request #17043:
URL: https://github.com/apache/beam/pull/17043#discussion_r826434766



##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -2209,6 +2209,112 @@ def test_nlargest_any(self):
     # but not necessarily with the same index
     self.assert_frame_data_equivalent(result, df.population.nlargest(3))
 
+  def test_pivot_non_categorical(self):
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    with self.assertRaisesRegex(
+        frame_base.WontImplementError,
+        r"pivot\(\) of non-categorical type is not supported"):
+      self._evaluate(
+          lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+
+  def test_pivot_pandas_example1(self):
+    # Simple test 1
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    result = self._evaluate(
+        lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+    self.assert_frame_data_equivalent(
+        result, df.pivot(index='foo', columns='bar', values='baz'))

Review comment:
       I think it would be preferable to test the positive test cases the standard way (in `DeferredFrameTest` rather than in `BeamSpecificTest`). These cases _should_ work the same as they in standard pandas. We should only test it here if we don't expect the results to be identical for some reason.

##########
File path: sdks/python/apache_beam/dataframe/frames_test.py
##########
@@ -2209,6 +2209,112 @@ def test_nlargest_any(self):
     # but not necessarily with the same index
     self.assert_frame_data_equivalent(result, df.population.nlargest(3))
 
+  def test_pivot_non_categorical(self):
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    with self.assertRaisesRegex(
+        frame_base.WontImplementError,
+        r"pivot\(\) of non-categorical type is not supported"):
+      self._evaluate(
+          lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+
+  def test_pivot_pandas_example1(self):
+    # Simple test 1
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    result = self._evaluate(
+        lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+    self.assert_frame_data_equivalent(
+        result, df.pivot(index='foo', columns='bar', values='baz'))
+
+  def test_pivot_pandas_example2(self):
+    # Simple test 2
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    result = self._evaluate(lambda df: df.pivot(index='foo', columns='bar'), df)
+    self.assert_frame_data_equivalent(
+        result['baz'], df.pivot(index='foo', columns='bar')['baz'])

Review comment:
       Ah, this looks like at least one case that should be tested here, since we're only verifying `baz`. Is there some reasonable verification we can do on the rest of `result` (e.g. should it be all NaNs)?

##########
File path: sdks/python/apache_beam/dataframe/frames.py
##########
@@ -3655,6 +3655,40 @@ def shift(self, axis, freq, **kwargs):
   describe = _agg_method(pd.DataFrame, 'describe')
   max = _agg_method(pd.DataFrame, 'max')
   min = _agg_method(pd.DataFrame, 'min')
+
+  @frame_base.with_docs_from(pd.DataFrame)
+  @frame_base.args_to_kwargs(pd.DataFrame)
+  @frame_base.populate_defaults(pd.DataFrame)
+  def pivot(self, **kwargs):
+    columns = kwargs.get('columns', None)
+    selected_cols = self._expr.proxy()[columns]
+
+    if isinstance(selected_cols, pd.Series):
+      all_cols_are_categorical = isinstance(
+        selected_cols.dtype, pd.CategoricalDtype
+      )
+    else:
+      all_cols_are_categorical = all(
+        isinstance(c, pd.CategoricalDtype) for c in selected_cols.dtypes
+      )
+    if not all_cols_are_categorical:
+      raise frame_base.WontImplementError(
+          "pivot() of non-categorical type is not supported because "
+          "the type of the output column depends on the data. Please use "
+          "pd.CategoricalDtype with explicit categories.",
+          reason="non-deferred-columns")
+
+    proxy = pd.DataFrame(columns=self._expr.proxy().pivot(**kwargs).columns)
+
+    with expressions.allow_non_parallel_operations():

Review comment:
       I don't think it's a good idea to bypass the non-parallel operations here. The places we do that are cases where we know the data has already been reduced substantially (e.g. we just need to combine partial results for a global aggregation). In this case the input _could_ be very large, so we'd want the user to have to acknowledge that they want to use Singleton partitioning.
   
   That being said, I think we should be able to do this without Singleton partitioning. What if we first move the `index` column(s) to the index in one expression (if set), then apply the `columns`/`values` part in a separate expression, with `requires=Index()`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@beam.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org