You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by ib...@apache.org on 2021/08/25 17:34:23 UTC
[beam] branch master updated: [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"

This is an automated email from the ASF dual-hosted git repository.

ibzib pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git


The following commit(s) were added to refs/heads/master by this push:
     new fc7df4b  [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
     new aa1c8e5  Merge pull request #15385 from ibzib/rollback-dataframes
fc7df4b is described below

commit fc7df4b97c571ff15dd5c388051fca1bf613665d
Author: Kyle Weaver <kc...@google.com>
AuthorDate: Tue Aug 24 16:13:39 2021 -0700

    [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
    
    This reverts commit faac725e98f8422b172a77e8c898af03c15b74c6.
---
 sdks/python/apache_beam/dataframe/frames.py        | 21 +++-----
 sdks/python/apache_beam/dataframe/frames_test.py   | 46 ++---------------
 .../apache_beam/dataframe/pandas_doctests_test.py  | 58 ++++------------------
 sdks/python/setup.py                               |  2 +-
 4 files changed, 22 insertions(+), 105 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 45ae8c6..b834d9c 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -55,9 +55,6 @@ __all__ = [
     'DeferredDataFrame',
 ]
 
-# Get major, minor version
-PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
-
 
 def populate_not_implemented(pd_type):
   def wrapper(deferred_type):
@@ -1935,7 +1932,7 @@ class DeferredSeries(DeferredDataFrameOrSeries):
     else:
       column = self
 
-    result = column.groupby(column, dropna=dropna).size()
+    result = column.groupby(column).size()
 
     # groupby.size() names the index, which we don't need
     result.index.name = None
@@ -2395,8 +2392,8 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
     if func in ('quantile',):
       return getattr(self, func)(*args, axis=axis, **kwargs)
 
-    # In pandas<1.3.0, maps to a property, args are ignored
-    if func in ('size',) and PD_VERSION < (1, 3):
+    # Maps to a property, args are ignored
+    if func in ('size',):
       return getattr(self, func)
 
     # We also have specialized distributed implementations for these. They only
@@ -3395,7 +3392,7 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
 
   @frame_base.with_docs_from(pd.DataFrame)
   def value_counts(self, subset=None, sort=False, normalize=False,
-                   ascending=False, dropna=True):
+                   ascending=False):
     """``sort`` is ``False`` by default, and ``sort=True`` is not supported
     because it imposes an ordering on the dataset which likely will not be
     preserved."""
@@ -3406,16 +3403,10 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
           "ordering on the dataset which likely will not be preserved.",
           reason="order-sensitive")
     columns = subset or list(self.columns)
-
-    if dropna:
-      dropped = self.dropna()
-    else:
-      dropped = self
-
-    result = dropped.groupby(columns, dropna=dropna).size()
+    result = self.groupby(columns).size()
 
     if normalize:
-      return result/dropped.length()
+      return result/self.dropna().length()
     else:
       return result
 
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index a2703d8..c3972ad 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -25,8 +25,7 @@ from apache_beam.dataframe import expressions
 from apache_beam.dataframe import frame_base
 from apache_beam.dataframe import frames
 
-# Get major, minor version
-PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
+PD_VERSION = tuple(map(int, pd.__version__.split('.')))
 
 GROUPBY_DF = pd.DataFrame({
     'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
@@ -236,17 +235,6 @@ class DeferredFrameTest(_AbstractFrameTest):
     self._run_test(
         lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2)
 
-  @unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3")
-  def test_value_counts_dropna_false(self):
-    df = pd.DataFrame({
-        'first_name': ['John', 'Anne', 'John', 'Beth'],
-        'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']
-    })
-    # TODO(BEAM-12495): Remove the assertRaises this when the underlying bug in
-    # https://github.com/pandas-dev/pandas/issues/36470 is fixed.
-    with self.assertRaises(NotImplementedError):
-      self._run_test(lambda df: df.value_counts(dropna=False), df)
-
   def test_get_column(self):
     df = pd.DataFrame({
         'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
@@ -381,15 +369,10 @@ class DeferredFrameTest(_AbstractFrameTest):
         nonparallel=True)
 
   def test_combine_Series(self):
-    s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
-    s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
-    self._run_test(
-        lambda s1,
-        s2: s1.combine(s2, max),
-        s1,
-        s2,
-        nonparallel=True,
-        check_proxy=False)
+    with expressions.allow_non_parallel_operations():
+      s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
+      s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
+      self._run_test(lambda s1, s2: s1.combine(s2, max), s1, s2)
 
   def test_combine_first_dataframe(self):
     df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
@@ -604,27 +587,8 @@ class DeferredFrameTest(_AbstractFrameTest):
     self._run_test(lambda df: df.value_counts(), df)
     self._run_test(lambda df: df.value_counts(normalize=True), df)
 
-    if PD_VERSION >= (1, 3):
-      # dropna=False is new in pandas 1.3
-      # TODO(BEAM-12495): Remove the assertRaises this when the underlying bug
-      # in https://github.com/pandas-dev/pandas/issues/36470 is fixed.
-      with self.assertRaises(NotImplementedError):
-        self._run_test(lambda df: df.value_counts(dropna=False), df)
-
-    # Test the defaults.
     self._run_test(lambda df: df.num_wings.value_counts(), df)
     self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df)
-    self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df)
-
-    # Test the combination interactions.
-    for normalize in (True, False):
-      for dropna in (True, False):
-        self._run_test(
-            lambda df,
-            dropna=dropna,
-            normalize=normalize: df.num_wings.value_counts(
-                dropna=dropna, normalize=normalize),
-            df)
 
   def test_value_counts_does_not_support_sort(self):
     df = pd.DataFrame({
diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
index 755e4e5..edc42f1 100644
--- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
+++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
@@ -20,7 +20,6 @@ import unittest
 import pandas as pd
 
 from apache_beam.dataframe import doctests
-from apache_beam.dataframe.frames import PD_VERSION
 from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function
 
 
@@ -69,8 +68,7 @@ class DoctestTest(unittest.TestCase):
                 "df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
             ],
             'pandas.core.generic.NDFrame.fillna': [
-                'df.fillna(method=\'ffill\')',
-                'df.fillna(method="ffill")',
+                "df.fillna(method='ffill')",
                 'df.fillna(value=values, limit=1)',
             ],
             'pandas.core.generic.NDFrame.sort_values': ['*'],
@@ -166,8 +164,7 @@ class DoctestTest(unittest.TestCase):
             'pandas.core.frame.DataFrame.cumprod': ['*'],
             'pandas.core.frame.DataFrame.diff': ['*'],
             'pandas.core.frame.DataFrame.fillna': [
-                'df.fillna(method=\'ffill\')',
-                'df.fillna(method="ffill")',
+                "df.fillna(method='ffill')",
                 'df.fillna(value=values, limit=1)',
             ],
             'pandas.core.frame.DataFrame.items': ['*'],
@@ -240,8 +237,6 @@ class DoctestTest(unittest.TestCase):
                 # reindex not supported
                 's2 = s.reindex([1, 0, 2, 3])',
             ],
-            'pandas.core.frame.DataFrame.resample': ['*'],
-            'pandas.core.frame.DataFrame.values': ['*'],
         },
         not_implemented_ok={
             'pandas.core.frame.DataFrame.transform': [
@@ -249,8 +244,6 @@ class DoctestTest(unittest.TestCase):
                 # frames_test.py::DeferredFrameTest::test_groupby_transform_sum
                 "df.groupby('Date')['Data'].transform('sum')",
             ],
-            'pandas.core.frame.DataFrame.swaplevel': ['*'],
-            'pandas.core.frame.DataFrame.melt': ['*'],
             'pandas.core.frame.DataFrame.reindex_axis': ['*'],
             'pandas.core.frame.DataFrame.round': [
                 'df.round(decimals)',
@@ -274,11 +267,6 @@ class DoctestTest(unittest.TestCase):
             'pandas.core.frame.DataFrame.set_index': [
                 "df.set_index([s, s**2])",
             ],
-
-            # TODO(BEAM-12495)
-            'pandas.core.frame.DataFrame.value_counts': [
-              'df.value_counts(dropna=False)'
-            ],
         },
         skip={
             # s2 created with reindex
@@ -286,8 +274,6 @@ class DoctestTest(unittest.TestCase):
                 'df.dot(s2)',
             ],
 
-            'pandas.core.frame.DataFrame.resample': ['df'],
-            'pandas.core.frame.DataFrame.asfreq': ['*'],
             # Throws NotImplementedError when modifying df
             'pandas.core.frame.DataFrame.axes': [
                 # Returns deferred index.
@@ -316,14 +302,6 @@ class DoctestTest(unittest.TestCase):
             'pandas.core.frame.DataFrame.to_markdown': ['*'],
             'pandas.core.frame.DataFrame.to_parquet': ['*'],
 
-            # Raises right exception, but testing framework has matching issues.
-            # Tested in `frames_test.py`.
-            'pandas.core.frame.DataFrame.insert': [
-                'df',
-                'df.insert(1, "newcol", [99, 99])',
-                'df.insert(0, "col1", [100, 100], allow_duplicates=True)'
-            ],
-
             'pandas.core.frame.DataFrame.to_records': [
                 'df.index = df.index.rename("I")',
                 'index_dtypes = f"<S{df.index.str.len().max()}"', # 1.x
@@ -407,8 +385,7 @@ class DoctestTest(unittest.TestCase):
                 's.dot(arr)',  # non-deferred result
             ],
             'pandas.core.series.Series.fillna': [
-                'df.fillna(method=\'ffill\')',
-                'df.fillna(method="ffill")',
+                "df.fillna(method='ffill')",
                 'df.fillna(value=values, limit=1)',
             ],
             'pandas.core.series.Series.items': ['*'],
@@ -457,11 +434,11 @@ class DoctestTest(unittest.TestCase):
                 's.drop_duplicates()',
                 "s.drop_duplicates(keep='last')",
             ],
+            'pandas.core.series.Series.repeat': [
+                's.repeat([1, 2, 3])'
+            ],
             'pandas.core.series.Series.reindex': ['*'],
             'pandas.core.series.Series.autocorr': ['*'],
-            'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'],
-            'pandas.core.series.Series.resample': ['*'],
-            'pandas.core.series.Series': ['ser.iloc[0] = 999'],
         },
         not_implemented_ok={
             'pandas.core.series.Series.transform': [
@@ -474,11 +451,8 @@ class DoctestTest(unittest.TestCase):
                 'ser.groupby(["a", "b", "a", np.nan]).mean()',
                 'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
             ],
-            'pandas.core.series.Series.swaplevel' :['*']
         },
         skip={
-            # Relies on setting values with iloc
-            'pandas.core.series.Series': ['ser', 'r'],
             'pandas.core.series.Series.groupby': [
                 # TODO(BEAM-11393): This example requires aligning two series
                 # with non-unique indexes. It only works in pandas because
@@ -486,7 +460,6 @@ class DoctestTest(unittest.TestCase):
                 # alignment.
                 'ser.groupby(ser > 100).mean()',
             ],
-            'pandas.core.series.Series.asfreq': ['*'],
             # error formatting
             'pandas.core.series.Series.append': [
                 's1.append(s2, verify_integrity=True)',
@@ -518,12 +491,12 @@ class DoctestTest(unittest.TestCase):
                 # Inspection after modification.
                 's'
             ],
-            'pandas.core.series.Series.resample': ['df'],
         })
     self.assertEqual(result.failed, 0)
 
   def test_string_tests(self):
-    if PD_VERSION < (1, 2):
+    PD_VERSION = tuple(int(v) for v in pd.__version__.split('.'))
+    if PD_VERSION < (1, 2, 0):
       module = pd.core.strings
     else:
       # Definitions were moved to accessor in pandas 1.2.0
@@ -695,13 +668,11 @@ class DoctestTest(unittest.TestCase):
             'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'],
             'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'],
             'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [
-                'df.fillna(method=\'ffill\')',
-                'df.fillna(method="ffill")',
+                "df.fillna(method='ffill')",
                 'df.fillna(value=values, limit=1)',
             ],
             'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
-                'df.fillna(method=\'ffill\')',
-                'df.fillna(method="ffill")',
+                "df.fillna(method='ffill')",
                 'df.fillna(value=values, limit=1)',
             ],
         },
@@ -711,7 +682,6 @@ class DoctestTest(unittest.TestCase):
             'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
             'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
             'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
-            'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'],
         },
         skip={
             'pandas.core.groupby.generic.SeriesGroupBy.cov': [
@@ -728,14 +698,6 @@ class DoctestTest(unittest.TestCase):
             # These examples rely on grouping by a list
             'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
             'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
-            'pandas.core.groupby.generic.SeriesGroupBy.transform': [
-                # Dropping invalid columns during a transform is unsupported.
-                'grouped.transform(lambda x: (x - x.mean()) / x.std())'
-            ],
-            'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
-                # Dropping invalid columns during a transform is unsupported.
-                'grouped.transform(lambda x: (x - x.mean()) / x.std())'
-            ],
         })
     self.assertEqual(result.failed, 0)
 
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 338251d..f4e02b8 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -166,7 +166,7 @@ if sys.platform == 'win32' and sys.maxsize <= 2**32:
 REQUIRED_TEST_PACKAGES = [
     'freezegun>=0.3.12',
     'mock>=1.0.1,<3.0.0',
-    'pandas>=1.0,<1.4.0',
+    'pandas>=1.0,<1.3.0',
     'parameterized>=0.7.1,<0.8.0',
     'pyhamcrest>=1.9,!=1.10.0,<2.0.0',
     'pyyaml>=3.12,<6.0.0',