You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@beam.apache.org by GitBox <gi...@apache.org> on 2022/04/04 21:30:34 UTC

[GitHub] [beam] TheNeuralBit commented on a diff in pull request #17043: [BEAM-13966] Add pivot(), a non-deferred column operation on categorical columns

TheNeuralBit commented on code in PR #17043:
URL: https://github.com/apache/beam/pull/17043#discussion_r842156113


##########
sdks/python/apache_beam/dataframe/frames_test.py:
##########
@@ -1431,6 +1433,165 @@ def test_unstack_series_multiple_index_and_column_levels(self):
     self._run_test(lambda df: df.unstack(level=['second', 'third']), df)
     self._run_test(lambda df: df.unstack(level=['second']), df)
 
+  def test_pivot_non_categorical(self):
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    with self.assertRaisesRegex(
+        frame_base.WontImplementError,
+        r"pivot\(\) of non-categorical type is not supported"):
+      self._run_test(
+          lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+
+  def test_pivot_pandas_example1(self):
+    # Simple test 1
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    self._run_test(
+        lambda df: df.pivot(index='foo', columns='bar', values='baz'), df)
+    self._run_test(
+        lambda df: df.pivot(index=['foo'], columns='bar', values='baz'), df)
+
+  def test_pivot_pandas_example3(self):
+    # Multiple values
+    df = pd.DataFrame({
+        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
+        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        'baz': [1, 2, 3, 4, 5, 6],
+        'zoo': ['x', 'y', 'z', 'q', 'w', 't']
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    self._run_test(
+        lambda df: df.pivot(index='foo', columns='bar', values=['baz', 'zoo']),
+        df)
+    self._run_test(
+        lambda df: df.pivot(
+            index='foo', columns=['bar'], values=['baz', 'zoo']),
+        df)
+
+  def test_pivot_pandas_example4(self):
+    # Multiple columns
+    df = pd.DataFrame({
+        "lev1": [1, 1, 1, 2, 2, 2],
+        "lev2": [1, 1, 2, 1, 1, 2],
+        "lev3": [1, 2, 1, 2, 1, 2],
+        "lev4": [1, 2, 3, 4, 5, 6],
+        "values": [0, 1, 2, 3, 4, 5]
+    })
+    df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['values'] = df['values'].astype('Int64')
+    self._run_test(
+        lambda df: df.pivot(
+            index="lev1", columns=["lev2", "lev3"], values="values"),
+        df)
+
+  def test_pivot_pandas_example5(self):
+    # Multiple index
+    df = pd.DataFrame({
+        "lev1": [1, 1, 1, 2, 2, 2],
+        "lev2": [1, 1, 2, 1, 1, 2],
+        "lev3": [1, 2, 1, 2, 1, 2],
+        "lev4": [1, 2, 3, 4, 5, 6],
+        "values": [0, 1, 2, 3, 4, 5]
+    })
+    df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    # Cast to nullable Int64 because Beam doesn't do the correct conversion to
+    # float64
+    df['values'] = df['values'].astype('Int64')
+    if PD_VERSION < (1, 4):
+      with self.assertRaisesRegex(
+          frame_base.WontImplementError,
+          r"pivot\(\) is not supported when pandas<1.4 and index is a Multi"):
+        self._run_test(
+            lambda df: df.pivot(
+                index=["lev1", "lev2"], columns=["lev3"], values="values"),
+            df)
+    else:
+      self._run_test(
+          lambda df: df.pivot(
+              index=["lev1", "lev2"], columns=["lev3"], values="values"),
+          df)
+
+  def test_pivot_pandas_example6(self):
+    # Value error when there are duplicates
+    df = pd.DataFrame({
+        "foo": ['one', 'one', 'two', 'two'],
+        "bar": ['A', 'A', 'B', 'C'],
+        "baz": [1, 2, 3, 4]
+    })
+    df['bar'] = df['bar'].astype(
+        pd.CategoricalDtype(categories=['A', 'B', 'C']))
+    self._run_error_test(
+        lambda df: df.pivot(index='foo', columns='bar', values='baz'),
+        df,
+        construction_time=False)
+
+  def test_pivot_no_index_provided_on_single_level_index(self):
+    # Multiple columns, no index value provided
+    df = pd.DataFrame({
+        "lev1": [1, 1, 1, 2, 2, 2],
+        "lev2": [1, 1, 2, 1, 1, 2],
+        "lev3": [1, 2, 1, 2, 1, 2],
+        "lev4": [1, 2, 3, 4, 5, 6],
+        "values": [0, 1, 2, 3, 4, 5]
+    })
+    df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['values'] = df['values'].astype('Int64')
+    self._run_test(
+        lambda df: df.pivot(columns=["lev2", "lev3"], values="values"), df)
+
+  def test_pivot_no_index_provided_on_multiindex(self):
+    # Multiple columns, no index value provided
+    tuples = list(
+        zip(
+            *[
+                ["bar", "bar", "bar", "baz", "baz", "baz"],
+                [
+                    "one",
+                    "two",
+                    "three",
+                    "one",
+                    "two",
+                    "three",
+                ],
+            ]))
+    index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
+    df = pd.DataFrame({
+        "lev1": [1, 1, 1, 2, 2, 2],
+        "lev2": [1, 1, 2, 1, 1, 2],
+        "lev3": [1, 2, 1, 2, 1, 2],
+        "lev4": [1, 2, 3, 4, 5, 6],
+        "values": [0, 1, 2, 3, 4, 5]
+    },
+                      index=index)
+    df['lev2'] = df['lev2'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['lev3'] = df['lev3'].astype(pd.CategoricalDtype(categories=[1, 2]))
+    df['values'] = df['values'].astype('Float64')

Review Comment:
   This is failing just with pandas 1.1, maybe the capitalized aliases don't work there?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@beam.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org