You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2024/02/02 01:04:47 UTC

(spark) branch master updated: [SPARK-46936][PS] Implement `Frame.to_feather`

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 53cbaeb20293 [SPARK-46936][PS] Implement `Frame.to_feather`
53cbaeb20293 is described below

commit 53cbaeb202931a918864ca4aecd9826144ad8307
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Fri Feb 2 09:04:26 2024 +0800

    [SPARK-46936][PS] Implement `Frame.to_feather`
    
    ### What changes were proposed in this pull request?
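    Implement `DataFrame.to_feather` in pandas API on Spark. The data is converted to an internal pandas DataFrame and the call is delegated to `pandas.DataFrame.to_feather`.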
    
    ### Why are the changes needed?
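    For parity with the pandas API: `to_feather` was previously listed as an unsupported function in pandas API on Spark.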
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, `DataFrame.to_feather` is now available in pandas API on Spark:
    ```
    In [3]: pdf = pd.DataFrame(
       ...:             [[1, 1.0, "a"]],
       ...:             columns=["x", "y", "z"],
       ...:         )
    
    In [4]: pdf
    Out[4]:
       x    y  z
    0  1  1.0  a
    
    In [5]: psdf = ps.from_pandas(pdf)
    
    In [6]: psdf
    Out[6]:
       x    y  z
    0  1  1.0  a
    
    In [7]: pdf.to_feather("/tmp/file1.feather")
    
    In [8]: psdf.to_feather("/tmp/file2.feather")
    
    In [9]: f1 = pd.read_feather("/tmp/file1.feather")
    
    In [10]: f1
    Out[10]:
       x    y  z
    0  1  1.0  a
    
    In [11]: f2 = pd.read_feather("/tmp/file2.feather")
    
    In [12]: f2
    Out[12]:
       x    y  z
    0  1  1.0  a
    
    ```
    
    ### How was this patch tested?
    Added unit tests (`test_feather.py` and its Spark Connect parity counterpart `test_parity_feather.py`).
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #44972 from zhengruifeng/ps_to_feather.
    
    Authored-by: Ruifeng Zheng <ru...@apache.org>
    Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
 dev/sparktestsupport/modules.py                    |  2 +
 .../docs/source/reference/pyspark.pandas/frame.rst |  1 +
 python/pyspark/pandas/frame.py                     | 35 +++++++++++
 python/pyspark/pandas/missing/frame.py             |  1 -
 .../pandas/tests/connect/io/test_parity_feather.py | 42 +++++++++++++
 python/pyspark/pandas/tests/io/test_feather.py     | 68 ++++++++++++++++++++++
 6 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 508cf56b9c87..233dcf4e54b6 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -816,6 +816,7 @@ pyspark_pandas = Module(
         "pyspark.pandas.tests.series.test_stat",
         "pyspark.pandas.tests.io.test_io",
         "pyspark.pandas.tests.io.test_csv",
+        "pyspark.pandas.tests.io.test_feather",
         "pyspark.pandas.tests.io.test_dataframe_conversion",
         "pyspark.pandas.tests.io.test_dataframe_spark_io",
         "pyspark.pandas.tests.io.test_series_conversion",
@@ -1297,6 +1298,7 @@ pyspark_pandas_connect_part3 = Module(
         # pandas-on-Spark unittests
         "pyspark.pandas.tests.connect.io.test_parity_io",
         "pyspark.pandas.tests.connect.io.test_parity_csv",
+        "pyspark.pandas.tests.connect.io.test_parity_feather",
         "pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
         "pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
         "pyspark.pandas.tests.connect.io.test_parity_series_conversion",
diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst
index 77b60468b8fb..564ddb607a19 100644
--- a/python/docs/source/reference/pyspark.pandas/frame.rst
+++ b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -283,6 +283,7 @@ Serialization / IO / Conversion
    DataFrame.to_numpy
    DataFrame.to_spark
    DataFrame.to_string
+   DataFrame.to_feather
    DataFrame.to_json
    DataFrame.to_dict
    DataFrame.to_excel
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 7222b877bba1..3b3565f7ea9f 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -2648,6 +2648,41 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
             psdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args
         )
 
+    def to_feather(
+        self,
+        path: Union[str, IO[bytes]],
+        **kwargs: Any,
+    ) -> None:
+        """
+        Write a DataFrame to the binary Feather format.
+
+        .. note:: This method should only be used if the DataFrame is expected to be
+                  small, as all the data is loaded into the driver's memory.
+
+        .. versionadded:: 4.0.0
+
+        Parameters
+        ----------
+        path : str, path object, file-like object
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function.
+        **kwargs :
+            Additional keywords passed to :func:`pyarrow.feather.write_feather`.
+            This includes the `compression`, `compression_level`, `chunksize`
+            and `version` keywords.
+
+        Examples
+        --------
+        >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
+        >>> df.to_feather("file.feather")  # doctest: +SKIP
+        """
+        # Make sure locals() call is at the top of the function so we don't capture local variables.
+        args = locals()
+
+        return validate_arguments_and_invoke_function(
+            self._to_internal_pandas(), self.to_feather, pd.DataFrame.to_feather, args
+        )
+
     def transpose(self) -> "DataFrame":
         """
         Transpose index and columns.
diff --git a/python/pyspark/pandas/missing/frame.py b/python/pyspark/pandas/missing/frame.py
index 25a3a2afa3df..fdb6cec7c0f9 100644
--- a/python/pyspark/pandas/missing/frame.py
+++ b/python/pyspark/pandas/missing/frame.py
@@ -42,7 +42,6 @@ class MissingPandasLikeDataFrame:
     infer_objects = _unsupported_function("infer_objects")
     reorder_levels = _unsupported_function("reorder_levels")
     set_axis = _unsupported_function("set_axis")
-    to_feather = _unsupported_function("to_feather")
     to_period = _unsupported_function("to_period")
     to_sql = _unsupported_function("to_sql")
     to_stata = _unsupported_function("to_stata")
diff --git a/python/pyspark/pandas/tests/connect/io/test_parity_feather.py b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py
new file mode 100644
index 000000000000..5d48fed547f5
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.io.test_feather import FeatherMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+
+
+class FeatherParityTests(
+    FeatherMixin,
+    PandasOnSparkTestUtils,
+    ReusedConnectTestCase,
+    TestUtils,
+):
+    pass  # no tests of its own: the test methods come from FeatherMixin
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.connect.io.test_parity_feather import *  # noqa: F401
+    # Prefer xmlrunner's XMLTestRunner (output under target/test-reports) when it is importable.
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:  # xmlrunner is optional; None makes unittest use its default runner
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/io/test_feather.py b/python/pyspark/pandas/tests/io/test_feather.py
new file mode 100644
index 000000000000..74fa6bc7d7b6
--- /dev/null
+++ b/python/pyspark/pandas/tests/io/test_feather.py
@@ -0,0 +1,68 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class FeatherMixin:  # relies on the host test case for temp_dir() and assert_eq()
+    @property
+    def pdf(self):  # one-row pandas frame with an int, a float and a str column
+        return pd.DataFrame(
+            [[1, 1.0, "a"]],
+            columns=["x", "y", "z"],
+        )
+
+    @property
+    def psdf(self):  # pandas-on-Spark counterpart built from ``pdf``
+        return ps.from_pandas(self.pdf)
+
+    def test_to_feather(self):
+        with self.temp_dir() as dirpath:
+            path1 = f"{dirpath}/file1.feather"
+            path2 = f"{dirpath}/file2.feather"
+            # Write the same data once through pandas and once through pandas-on-Spark.
+            self.pdf.to_feather(path1)
+            self.psdf.to_feather(path2)
+            # Both files are read back with pandas and must compare equal.
+            self.assert_eq(
+                pd.read_feather(path1),
+                pd.read_feather(path2),
+            )
+
+
+class FeatherTests(
+    FeatherMixin,
+    PandasOnSparkTestCase,
+    TestUtils,
+):
+    pass  # no tests of its own: test_to_feather is inherited from FeatherMixin
+
+
+if __name__ == "__main__":
+    from pyspark.pandas.tests.io.test_feather import *  # noqa: F401
+    # Prefer xmlrunner's XMLTestRunner (output under target/test-reports) when it is importable.
+    try:
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:  # xmlrunner is optional; None makes unittest use its default runner
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org