You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/28 09:41:07 UTC
[spark] branch branch-3.2 updated: [SPARK-36320][PYTHON] Fix
Series/Index.copy() to drop extra columns
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new c9af94e [SPARK-36320][PYTHON] Fix Series/Index.copy() to drop extra columns
c9af94e is described below
commit c9af94ecb4893668e2522b82a02f679547eb8996
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Wed Jul 28 18:39:53 2021 +0900
[SPARK-36320][PYTHON] Fix Series/Index.copy() to drop extra columns
### What changes were proposed in this pull request?
Fix `Series`/`Index.copy()` to drop extra columns.
### Why are the changes needed?
Currently `Series`/`Index.copy()` keeps the copy of the anchor DataFrame which holds unnecessary columns.
We can drop those when `Series`/`Index.copy()` is called.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Existing tests.
Closes #33549 from ueshin/issues/SPARK-36320/index_ops_copy.
Authored-by: Takuya UESHIN <ue...@databricks.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
(cherry picked from commit 3c76a924ce2c930a561afcc3e017d9ce5b54cdf5)
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/categorical.py | 27 +++++++--------------------
python/pyspark/pandas/groupby.py | 2 +-
python/pyspark/pandas/indexes/base.py | 2 +-
python/pyspark/pandas/series.py | 8 ++++----
4 files changed, 13 insertions(+), 26 deletions(-)
diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index c12227e..77a3cee 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -261,8 +261,7 @@ class CategoricalAccessor(object):
self._data._psdf._update_internal_frame(internal)
return None
else:
- psser = DataFrame(internal)._psser_for(self._data._column_label)
- return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
from pyspark.pandas.frame import DataFrame
@@ -271,7 +270,7 @@ class CategoricalAccessor(object):
if inplace:
return None
else:
- psser = self._data
+ return self._data.copy()
else:
internal = self._data._psdf._internal.with_new_spark_column(
self._data._column_label,
@@ -284,9 +283,7 @@ class CategoricalAccessor(object):
self._data._psdf._update_internal_frame(internal)
return None
else:
- psser = DataFrame(internal)._psser_for(self._data._column_label)
-
- return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
"""
@@ -455,10 +452,7 @@ class CategoricalAccessor(object):
if inplace:
return None
else:
- psser = self._data
- return psser._with_new_scol(
- psser.spark.column, field=psser._internal.data_fields[0]
- )
+ return self._data.copy()
else:
dtype = CategoricalDtype(
[cat for cat in self.categories if cat not in categories], ordered=self.ordered
@@ -646,8 +640,7 @@ class CategoricalAccessor(object):
self._data._psdf._update_internal_frame(internal)
return None
else:
- psser = DataFrame(internal)._psser_for(self._data._column_label)
- return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
def reorder_categories(
self,
@@ -739,10 +732,7 @@ class CategoricalAccessor(object):
if inplace:
return None
else:
- psser = self._data
- return psser._with_new_scol(
- psser.spark.column, field=psser._internal.data_fields[0]
- )
+ return self._data.copy()
else:
dtype = CategoricalDtype(categories=new_categories, ordered=ordered)
psser = self._data.astype(dtype)
@@ -897,10 +887,7 @@ class CategoricalAccessor(object):
self._data._psdf._update_internal_frame(internal)
return None
else:
- psser = DataFrame(internal)._psser_for(self._data._column_label)
- return psser._with_new_scol(
- psser.spark.column, field=psser._internal.data_fields[0]
- )
+ return DataFrame(internal)._psser_for(self._data._column_label).copy()
else:
psser = self._data.astype(new_dtype)
if inplace:
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 89fd4f2..c91fcd7 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -2943,7 +2943,7 @@ class SeriesGroupBy(GroupBy[Series]):
internal = psser._internal.resolved_copy
return first_series(DataFrame(internal))
else:
- return psser
+ return psser.copy()
def _cleanup_and_return(self, pdf: pd.DataFrame) -> Series:
return first_series(pdf).rename().rename(self._psser.name)
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 1be46fd..4158d61 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -1270,7 +1270,7 @@ class Index(IndexOpsMixin):
>>> df.index.copy(name='snake')
Index(['cobra', 'viper', 'sidewinder'], dtype='object', name='snake')
"""
- result = self._psdf.copy().index
+ result = self._psdf[[]].index
if name:
result.name = name
return result
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 2c1db8f..3eb3a2c 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -1895,7 +1895,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
self._psdf._update_internal_frame(psser._psdf._internal, requires_same_anchor=False)
return None
else:
- return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+ return psser.copy()
def _fillna(
self,
@@ -1919,7 +1919,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
cond = scol.isNull() | F.isnan(scol)
else:
if not self.spark.nullable:
- return self.copy()
+ return self._psdf.copy()._psser_for(self._column_label)
cond = scol.isNull()
if value is not None:
@@ -4109,7 +4109,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
b 2
dtype: int64
"""
- return self._psdf.copy(deep=deep)._psser_for(self._column_label)
+ return first_series(DataFrame(self._internal))
def mode(self, dropna: bool = True) -> "Series":
"""
@@ -6185,7 +6185,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
internal = psser._internal.resolved_copy
return first_series(DataFrame(internal))
else:
- return psser
+ return psser.copy()
def _reduce_for_stat_function(
self,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org