Posted to commits@spark.apache.org by gu...@apache.org on 2021/07/28 09:41:07 UTC

[spark] branch branch-3.2 updated: [SPARK-36320][PYTHON] Fix Series/Index.copy() to drop extra columns

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new c9af94e  [SPARK-36320][PYTHON] Fix Series/Index.copy() to drop extra columns
c9af94e is described below

commit c9af94ecb4893668e2522b82a02f679547eb8996
Author: Takuya UESHIN <ue...@databricks.com>
AuthorDate: Wed Jul 28 18:39:53 2021 +0900

    [SPARK-36320][PYTHON] Fix Series/Index.copy() to drop extra columns
    
    ### What changes were proposed in this pull request?
    
    Fix `Series`/`Index.copy()` to drop extra columns.
    
    ### Why are the changes needed?
    
    Currently `Series`/`Index.copy()` keeps a copy of the anchor DataFrame, which holds unnecessary columns.
    We can drop those columns when `Series`/`Index.copy()` is called.
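    
    For illustration only (not part of the patch), a minimal sketch of the behavior; it inspects the private `_psdf` anchor that the diff below also touches, and the example data is hypothetical:
    
    ```python
    import pyspark.pandas as ps
    
    psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    psser = psdf["a"]
    
    copied = psser.copy()
    # Before this change, the copy still anchored the full DataFrame, so its
    # internal frame carried the unused column "b" as well. With this change,
    # only the copied column remains in the anchor:
    print(copied._psdf.columns)  # expected with the fix: Index(['a'], dtype='object')
    ```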
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
    
    Closes #33549 from ueshin/issues/SPARK-36320/index_ops_copy.
    
    Authored-by: Takuya UESHIN <ue...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit 3c76a924ce2c930a561afcc3e017d9ce5b54cdf5)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/pandas/categorical.py  | 27 +++++++--------------------
 python/pyspark/pandas/groupby.py      |  2 +-
 python/pyspark/pandas/indexes/base.py |  2 +-
 python/pyspark/pandas/series.py       |  8 ++++----
 4 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
index c12227e..77a3cee 100644
--- a/python/pyspark/pandas/categorical.py
+++ b/python/pyspark/pandas/categorical.py
@@ -261,8 +261,7 @@ class CategoricalAccessor(object):
             self._data._psdf._update_internal_frame(internal)
             return None
         else:
-            psser = DataFrame(internal)._psser_for(self._data._column_label)
-            return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+            return DataFrame(internal)._psser_for(self._data._column_label).copy()
 
     def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
         from pyspark.pandas.frame import DataFrame
@@ -271,7 +270,7 @@ class CategoricalAccessor(object):
             if inplace:
                 return None
             else:
-                psser = self._data
+                return self._data.copy()
         else:
             internal = self._data._psdf._internal.with_new_spark_column(
                 self._data._column_label,
@@ -284,9 +283,7 @@ class CategoricalAccessor(object):
                 self._data._psdf._update_internal_frame(internal)
                 return None
             else:
-                psser = DataFrame(internal)._psser_for(self._data._column_label)
-
-        return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+                return DataFrame(internal)._psser_for(self._data._column_label).copy()
 
     def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
         """
@@ -455,10 +452,7 @@ class CategoricalAccessor(object):
             if inplace:
                 return None
             else:
-                psser = self._data
-                return psser._with_new_scol(
-                    psser.spark.column, field=psser._internal.data_fields[0]
-                )
+                return self._data.copy()
         else:
             dtype = CategoricalDtype(
                 [cat for cat in self.categories if cat not in categories], ordered=self.ordered
@@ -646,8 +640,7 @@ class CategoricalAccessor(object):
             self._data._psdf._update_internal_frame(internal)
             return None
         else:
-            psser = DataFrame(internal)._psser_for(self._data._column_label)
-            return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+            return DataFrame(internal)._psser_for(self._data._column_label).copy()
 
     def reorder_categories(
         self,
@@ -739,10 +732,7 @@ class CategoricalAccessor(object):
             if inplace:
                 return None
             else:
-                psser = self._data
-                return psser._with_new_scol(
-                    psser.spark.column, field=psser._internal.data_fields[0]
-                )
+                return self._data.copy()
         else:
             dtype = CategoricalDtype(categories=new_categories, ordered=ordered)
             psser = self._data.astype(dtype)
@@ -897,10 +887,7 @@ class CategoricalAccessor(object):
                 self._data._psdf._update_internal_frame(internal)
                 return None
             else:
-                psser = DataFrame(internal)._psser_for(self._data._column_label)
-                return psser._with_new_scol(
-                    psser.spark.column, field=psser._internal.data_fields[0]
-                )
+                return DataFrame(internal)._psser_for(self._data._column_label).copy()
         else:
             psser = self._data.astype(new_dtype)
             if inplace:
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 89fd4f2..c91fcd7 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -2943,7 +2943,7 @@ class SeriesGroupBy(GroupBy[Series]):
             internal = psser._internal.resolved_copy
             return first_series(DataFrame(internal))
         else:
-            return psser
+            return psser.copy()
 
     def _cleanup_and_return(self, pdf: pd.DataFrame) -> Series:
         return first_series(pdf).rename().rename(self._psser.name)
diff --git a/python/pyspark/pandas/indexes/base.py b/python/pyspark/pandas/indexes/base.py
index 1be46fd..4158d61 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -1270,7 +1270,7 @@ class Index(IndexOpsMixin):
         >>> df.index.copy(name='snake')
         Index(['cobra', 'viper', 'sidewinder'], dtype='object', name='snake')
         """
-        result = self._psdf.copy().index
+        result = self._psdf[[]].index
         if name:
             result.name = name
         return result
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 2c1db8f..3eb3a2c 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -1895,7 +1895,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
             self._psdf._update_internal_frame(psser._psdf._internal, requires_same_anchor=False)
             return None
         else:
-            return psser._with_new_scol(psser.spark.column, field=psser._internal.data_fields[0])
+            return psser.copy()
 
     def _fillna(
         self,
@@ -1919,7 +1919,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
             cond = scol.isNull() | F.isnan(scol)
         else:
             if not self.spark.nullable:
-                return self.copy()
+                return self._psdf.copy()._psser_for(self._column_label)
             cond = scol.isNull()
 
         if value is not None:
@@ -4109,7 +4109,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         b    2
         dtype: int64
         """
-        return self._psdf.copy(deep=deep)._psser_for(self._column_label)
+        return first_series(DataFrame(self._internal))
 
     def mode(self, dropna: bool = True) -> "Series":
         """
@@ -6185,7 +6185,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
             internal = psser._internal.resolved_copy
             return first_series(DataFrame(internal))
         else:
-            return psser
+            return psser.copy()
 
     def _reduce_for_stat_function(
         self,

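
For reference (outside the commit itself): the `self._psdf[[]].index` pattern in the indexes/base.py hunk works because selecting an empty column list yields a DataFrame that keeps the index but none of the data columns. A minimal sketch with hypothetical data:

```python
import pyspark.pandas as ps

psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Selecting zero columns keeps the index but drops every data column, so the
# copied Index no longer drags the anchor DataFrame's columns along.
idx_copy = psdf[[]].index
print(idx_copy)  # e.g. Int64Index([0, 1, 2], dtype='int64')

# Series.copy() similarly rebuilds from the series's own internal frame,
# which holds only that one column (see the series.py hunk above).
ser_copy = psdf["a"].copy()
```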