You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@beam.apache.org by ib...@apache.org on 2021/08/25 17:34:23 UTC
[beam] branch master updated: [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
This is an automated email from the ASF dual-hosted git repository.
ibzib pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new fc7df4b [BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
new aa1c8e5 Merge pull request #15385 from ibzib/rollback-dataframes
fc7df4b is described below
commit fc7df4b97c571ff15dd5c388051fca1bf613665d
Author: Kyle Weaver <kc...@google.com>
AuthorDate: Tue Aug 24 16:13:39 2021 -0700
[BEAM-12764] Revert "Merge pull request #15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
This reverts commit faac725e98f8422b172a77e8c898af03c15b74c6.
---
sdks/python/apache_beam/dataframe/frames.py | 21 +++-----
sdks/python/apache_beam/dataframe/frames_test.py | 46 ++---------------
.../apache_beam/dataframe/pandas_doctests_test.py | 58 ++++------------------
sdks/python/setup.py | 2 +-
4 files changed, 22 insertions(+), 105 deletions(-)
diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index 45ae8c6..b834d9c 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -55,9 +55,6 @@ __all__ = [
'DeferredDataFrame',
]
-# Get major, minor version
-PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
-
def populate_not_implemented(pd_type):
def wrapper(deferred_type):
@@ -1935,7 +1932,7 @@ class DeferredSeries(DeferredDataFrameOrSeries):
else:
column = self
- result = column.groupby(column, dropna=dropna).size()
+ result = column.groupby(column).size()
# groupby.size() names the index, which we don't need
result.index.name = None
@@ -2395,8 +2392,8 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
if func in ('quantile',):
return getattr(self, func)(*args, axis=axis, **kwargs)
- # In pandas<1.3.0, maps to a property, args are ignored
- if func in ('size',) and PD_VERSION < (1, 3):
+ # Maps to a property, args are ignored
+ if func in ('size',):
return getattr(self, func)
# We also have specialized distributed implementations for these. They only
@@ -3395,7 +3392,7 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
@frame_base.with_docs_from(pd.DataFrame)
def value_counts(self, subset=None, sort=False, normalize=False,
- ascending=False, dropna=True):
+ ascending=False):
"""``sort`` is ``False`` by default, and ``sort=True`` is not supported
because it imposes an ordering on the dataset which likely will not be
preserved."""
@@ -3406,16 +3403,10 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
"ordering on the dataset which likely will not be preserved.",
reason="order-sensitive")
columns = subset or list(self.columns)
-
- if dropna:
- dropped = self.dropna()
- else:
- dropped = self
-
- result = dropped.groupby(columns, dropna=dropna).size()
+ result = self.groupby(columns).size()
if normalize:
- return result/dropped.length()
+ return result/self.dropna().length()
else:
return result
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index a2703d8..c3972ad 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -25,8 +25,7 @@ from apache_beam.dataframe import expressions
from apache_beam.dataframe import frame_base
from apache_beam.dataframe import frames
-# Get major, minor version
-PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
+PD_VERSION = tuple(map(int, pd.__version__.split('.')))
GROUPBY_DF = pd.DataFrame({
'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
@@ -236,17 +235,6 @@ class DeferredFrameTest(_AbstractFrameTest):
self._run_test(
lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2)
- @unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3")
- def test_value_counts_dropna_false(self):
- df = pd.DataFrame({
- 'first_name': ['John', 'Anne', 'John', 'Beth'],
- 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']
- })
- # TODO(BEAM-12495): Remove the assertRaises this when the underlying bug in
- # https://github.com/pandas-dev/pandas/issues/36470 is fixed.
- with self.assertRaises(NotImplementedError):
- self._run_test(lambda df: df.value_counts(dropna=False), df)
-
def test_get_column(self):
df = pd.DataFrame({
'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
@@ -381,15 +369,10 @@ class DeferredFrameTest(_AbstractFrameTest):
nonparallel=True)
def test_combine_Series(self):
- s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
- s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
- self._run_test(
- lambda s1,
- s2: s1.combine(s2, max),
- s1,
- s2,
- nonparallel=True,
- check_proxy=False)
+ with expressions.allow_non_parallel_operations():
+ s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
+ s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
+ self._run_test(lambda s1, s2: s1.combine(s2, max), s1, s2)
def test_combine_first_dataframe(self):
df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
@@ -604,27 +587,8 @@ class DeferredFrameTest(_AbstractFrameTest):
self._run_test(lambda df: df.value_counts(), df)
self._run_test(lambda df: df.value_counts(normalize=True), df)
- if PD_VERSION >= (1, 3):
- # dropna=False is new in pandas 1.3
- # TODO(BEAM-12495): Remove the assertRaises this when the underlying bug
- # in https://github.com/pandas-dev/pandas/issues/36470 is fixed.
- with self.assertRaises(NotImplementedError):
- self._run_test(lambda df: df.value_counts(dropna=False), df)
-
- # Test the defaults.
self._run_test(lambda df: df.num_wings.value_counts(), df)
self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df)
- self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df)
-
- # Test the combination interactions.
- for normalize in (True, False):
- for dropna in (True, False):
- self._run_test(
- lambda df,
- dropna=dropna,
- normalize=normalize: df.num_wings.value_counts(
- dropna=dropna, normalize=normalize),
- df)
def test_value_counts_does_not_support_sort(self):
df = pd.DataFrame({
diff --git a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
index 755e4e5..edc42f1 100644
--- a/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
+++ b/sdks/python/apache_beam/dataframe/pandas_doctests_test.py
@@ -20,7 +20,6 @@ import unittest
import pandas as pd
from apache_beam.dataframe import doctests
-from apache_beam.dataframe.frames import PD_VERSION
from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function
@@ -69,8 +68,7 @@ class DoctestTest(unittest.TestCase):
"df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
],
'pandas.core.generic.NDFrame.fillna': [
- 'df.fillna(method=\'ffill\')',
- 'df.fillna(method="ffill")',
+ "df.fillna(method='ffill')",
'df.fillna(value=values, limit=1)',
],
'pandas.core.generic.NDFrame.sort_values': ['*'],
@@ -166,8 +164,7 @@ class DoctestTest(unittest.TestCase):
'pandas.core.frame.DataFrame.cumprod': ['*'],
'pandas.core.frame.DataFrame.diff': ['*'],
'pandas.core.frame.DataFrame.fillna': [
- 'df.fillna(method=\'ffill\')',
- 'df.fillna(method="ffill")',
+ "df.fillna(method='ffill')",
'df.fillna(value=values, limit=1)',
],
'pandas.core.frame.DataFrame.items': ['*'],
@@ -240,8 +237,6 @@ class DoctestTest(unittest.TestCase):
# reindex not supported
's2 = s.reindex([1, 0, 2, 3])',
],
- 'pandas.core.frame.DataFrame.resample': ['*'],
- 'pandas.core.frame.DataFrame.values': ['*'],
},
not_implemented_ok={
'pandas.core.frame.DataFrame.transform': [
@@ -249,8 +244,6 @@ class DoctestTest(unittest.TestCase):
# frames_test.py::DeferredFrameTest::test_groupby_transform_sum
"df.groupby('Date')['Data'].transform('sum')",
],
- 'pandas.core.frame.DataFrame.swaplevel': ['*'],
- 'pandas.core.frame.DataFrame.melt': ['*'],
'pandas.core.frame.DataFrame.reindex_axis': ['*'],
'pandas.core.frame.DataFrame.round': [
'df.round(decimals)',
@@ -274,11 +267,6 @@ class DoctestTest(unittest.TestCase):
'pandas.core.frame.DataFrame.set_index': [
"df.set_index([s, s**2])",
],
-
- # TODO(BEAM-12495)
- 'pandas.core.frame.DataFrame.value_counts': [
- 'df.value_counts(dropna=False)'
- ],
},
skip={
# s2 created with reindex
@@ -286,8 +274,6 @@ class DoctestTest(unittest.TestCase):
'df.dot(s2)',
],
- 'pandas.core.frame.DataFrame.resample': ['df'],
- 'pandas.core.frame.DataFrame.asfreq': ['*'],
# Throws NotImplementedError when modifying df
'pandas.core.frame.DataFrame.axes': [
# Returns deferred index.
@@ -316,14 +302,6 @@ class DoctestTest(unittest.TestCase):
'pandas.core.frame.DataFrame.to_markdown': ['*'],
'pandas.core.frame.DataFrame.to_parquet': ['*'],
- # Raises right exception, but testing framework has matching issues.
- # Tested in `frames_test.py`.
- 'pandas.core.frame.DataFrame.insert': [
- 'df',
- 'df.insert(1, "newcol", [99, 99])',
- 'df.insert(0, "col1", [100, 100], allow_duplicates=True)'
- ],
-
'pandas.core.frame.DataFrame.to_records': [
'df.index = df.index.rename("I")',
'index_dtypes = f"<S{df.index.str.len().max()}"', # 1.x
@@ -407,8 +385,7 @@ class DoctestTest(unittest.TestCase):
's.dot(arr)', # non-deferred result
],
'pandas.core.series.Series.fillna': [
- 'df.fillna(method=\'ffill\')',
- 'df.fillna(method="ffill")',
+ "df.fillna(method='ffill')",
'df.fillna(value=values, limit=1)',
],
'pandas.core.series.Series.items': ['*'],
@@ -457,11 +434,11 @@ class DoctestTest(unittest.TestCase):
's.drop_duplicates()',
"s.drop_duplicates(keep='last')",
],
+ 'pandas.core.series.Series.repeat': [
+ 's.repeat([1, 2, 3])'
+ ],
'pandas.core.series.Series.reindex': ['*'],
'pandas.core.series.Series.autocorr': ['*'],
- 'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'],
- 'pandas.core.series.Series.resample': ['*'],
- 'pandas.core.series.Series': ['ser.iloc[0] = 999'],
},
not_implemented_ok={
'pandas.core.series.Series.transform': [
@@ -474,11 +451,8 @@ class DoctestTest(unittest.TestCase):
'ser.groupby(["a", "b", "a", np.nan]).mean()',
'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
],
- 'pandas.core.series.Series.swaplevel' :['*']
},
skip={
- # Relies on setting values with iloc
- 'pandas.core.series.Series': ['ser', 'r'],
'pandas.core.series.Series.groupby': [
# TODO(BEAM-11393): This example requires aligning two series
# with non-unique indexes. It only works in pandas because
@@ -486,7 +460,6 @@ class DoctestTest(unittest.TestCase):
# alignment.
'ser.groupby(ser > 100).mean()',
],
- 'pandas.core.series.Series.asfreq': ['*'],
# error formatting
'pandas.core.series.Series.append': [
's1.append(s2, verify_integrity=True)',
@@ -518,12 +491,12 @@ class DoctestTest(unittest.TestCase):
# Inspection after modification.
's'
],
- 'pandas.core.series.Series.resample': ['df'],
})
self.assertEqual(result.failed, 0)
def test_string_tests(self):
- if PD_VERSION < (1, 2):
+ PD_VERSION = tuple(int(v) for v in pd.__version__.split('.'))
+ if PD_VERSION < (1, 2, 0):
module = pd.core.strings
else:
# Definitions were moved to accessor in pandas 1.2.0
@@ -695,13 +668,11 @@ class DoctestTest(unittest.TestCase):
'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [
- 'df.fillna(method=\'ffill\')',
- 'df.fillna(method="ffill")',
+ "df.fillna(method='ffill')",
'df.fillna(value=values, limit=1)',
],
'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
- 'df.fillna(method=\'ffill\')',
- 'df.fillna(method="ffill")',
+ "df.fillna(method='ffill')",
'df.fillna(value=values, limit=1)',
],
},
@@ -711,7 +682,6 @@ class DoctestTest(unittest.TestCase):
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
- 'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'],
},
skip={
'pandas.core.groupby.generic.SeriesGroupBy.cov': [
@@ -728,14 +698,6 @@ class DoctestTest(unittest.TestCase):
# These examples rely on grouping by a list
'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
- 'pandas.core.groupby.generic.SeriesGroupBy.transform': [
- # Dropping invalid columns during a transform is unsupported.
- 'grouped.transform(lambda x: (x - x.mean()) / x.std())'
- ],
- 'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
- # Dropping invalid columns during a transform is unsupported.
- 'grouped.transform(lambda x: (x - x.mean()) / x.std())'
- ],
})
self.assertEqual(result.failed, 0)
diff --git a/sdks/python/setup.py b/sdks/python/setup.py
index 338251d..f4e02b8 100644
--- a/sdks/python/setup.py
+++ b/sdks/python/setup.py
@@ -166,7 +166,7 @@ if sys.platform == 'win32' and sys.maxsize <= 2**32:
REQUIRED_TEST_PACKAGES = [
'freezegun>=0.3.12',
'mock>=1.0.1,<3.0.0',
- 'pandas>=1.0,<1.4.0',
+ 'pandas>=1.0,<1.3.0',
'parameterized>=0.7.1,<0.8.0',
'pyhamcrest>=1.9,!=1.10.0,<2.0.0',
'pyyaml>=3.12,<6.0.0',