You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ru...@apache.org on 2024/02/02 01:04:47 UTC
(spark) branch master updated: [SPARK-46936][PS] Implement `Frame.to_feather`
This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 53cbaeb20293 [SPARK-46936][PS] Implement `Frame.to_feather`
53cbaeb20293 is described below
commit 53cbaeb202931a918864ca4aecd9826144ad8307
Author: Ruifeng Zheng <ru...@apache.org>
AuthorDate: Fri Feb 2 09:04:26 2024 +0800
[SPARK-46936][PS] Implement `Frame.to_feather`
### What changes were proposed in this pull request?
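Implement `Frame.to_feather` (i.e. `DataFrame.to_feather`) in pandas API on Spark. The data is collected into an internal pandas DataFrame and the call is delegated to `pandas.DataFrame.to_feather`; the corresponding "unsupported function" stub is removed.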
### Why are the changes needed?
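For parity with pandas: `pandas.DataFrame.to_feather` exists, but pandas API on Spark previously reported it as unsupported.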
### Does this PR introduce _any_ user-facing change?
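Yes, `to_feather` is now available on pandas-on-Spark DataFrames and produces the same file contents as pandas: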
```
In [3]: pdf = pd.DataFrame(
...: [[1, 1.0, "a"]],
...: columns=["x", "y", "z"],
...: )
In [4]: pdf
Out[4]:
x y z
0 1 1.0 a
In [5]: psdf = ps.from_pandas(pdf)
In [6]: psdf
x y z
0 1 1.0 a
In [7]: pdf.to_feather("/tmp/file1.feather")
In [8]: psdf.to_feather("/tmp/file2.feather")
In [9]: f1 = pd.read_feather("/tmp/file1.feather")
In [10]: f1
Out[10]:
x y z
0 1 1.0 a
In [11]: f2 = pd.read_feather("/tmp/file2.feather")
In [12]: f2
Out[12]:
x y z
0 1 1.0 a
```
### How was this patch tested?
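Added unit tests: `pyspark.pandas.tests.io.test_feather` and its Spark Connect counterpart `pyspark.pandas.tests.connect.io.test_parity_feather`.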
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44972 from zhengruifeng/ps_to_feather.
Authored-by: Ruifeng Zheng <ru...@apache.org>
Signed-off-by: Ruifeng Zheng <ru...@apache.org>
---
dev/sparktestsupport/modules.py | 2 +
.../docs/source/reference/pyspark.pandas/frame.rst | 1 +
python/pyspark/pandas/frame.py | 35 +++++++++++
python/pyspark/pandas/missing/frame.py | 1 -
.../pandas/tests/connect/io/test_parity_feather.py | 42 +++++++++++++
python/pyspark/pandas/tests/io/test_feather.py | 68 ++++++++++++++++++++++
6 files changed, 148 insertions(+), 1 deletion(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 508cf56b9c87..233dcf4e54b6 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -816,6 +816,7 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.series.test_stat",
"pyspark.pandas.tests.io.test_io",
"pyspark.pandas.tests.io.test_csv",
+ "pyspark.pandas.tests.io.test_feather",
"pyspark.pandas.tests.io.test_dataframe_conversion",
"pyspark.pandas.tests.io.test_dataframe_spark_io",
"pyspark.pandas.tests.io.test_series_conversion",
@@ -1297,6 +1298,7 @@ pyspark_pandas_connect_part3 = Module(
# pandas-on-Spark unittests
"pyspark.pandas.tests.connect.io.test_parity_io",
"pyspark.pandas.tests.connect.io.test_parity_csv",
+ "pyspark.pandas.tests.connect.io.test_parity_feather",
"pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
"pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
"pyspark.pandas.tests.connect.io.test_parity_series_conversion",
diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst
index 77b60468b8fb..564ddb607a19 100644
--- a/python/docs/source/reference/pyspark.pandas/frame.rst
+++ b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -283,6 +283,7 @@ Serialization / IO / Conversion
DataFrame.to_numpy
DataFrame.to_spark
DataFrame.to_string
+ DataFrame.to_feather
DataFrame.to_json
DataFrame.to_dict
DataFrame.to_excel
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 7222b877bba1..3b3565f7ea9f 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -2648,6 +2648,41 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
psdf._to_internal_pandas(), self.to_latex, pd.DataFrame.to_latex, args
)
+ def to_feather(
+ self,
+ path: Union[str, IO[str]],
+ **kwargs: Any,
+ ) -> None:
+ """
+ Write a DataFrame to the binary Feather format.
+
+ .. note:: This method should only be used if the resulting DataFrame is expected
+ to be small, as all the data is loaded into the driver's memory.
+
+ .. versionadded:: 4.0.0
+
+ Parameters
+ ----------
+ path : str, path object, file-like object
+ String, path object (implementing ``os.PathLike[str]``), or file-like
+ object implementing a binary ``write()`` function.
+ **kwargs :
+ Additional keywords passed to :func:`pyarrow.feather.write_feather`.
+ This includes the `compression`, `compression_level`, `chunksize`
+ and `version` keywords.
+
+ Examples
+ --------
+ >>> df = ps.DataFrame([[1, 2, 3], [4, 5, 6]])
+ >>> df.to_feather("file.feather") # doctest: +SKIP
+ """
+ # Make sure locals() call is at the top of the function so we don't capture local variables.
+ args = locals()
+
+ return validate_arguments_and_invoke_function(
+ self._to_internal_pandas(), self.to_feather, pd.DataFrame.to_feather, args
+ )
+
def transpose(self) -> "DataFrame":
"""
Transpose index and columns.
diff --git a/python/pyspark/pandas/missing/frame.py b/python/pyspark/pandas/missing/frame.py
index 25a3a2afa3df..fdb6cec7c0f9 100644
--- a/python/pyspark/pandas/missing/frame.py
+++ b/python/pyspark/pandas/missing/frame.py
@@ -42,7 +42,6 @@ class MissingPandasLikeDataFrame:
infer_objects = _unsupported_function("infer_objects")
reorder_levels = _unsupported_function("reorder_levels")
set_axis = _unsupported_function("set_axis")
- to_feather = _unsupported_function("to_feather")
to_period = _unsupported_function("to_period")
to_sql = _unsupported_function("to_sql")
to_stata = _unsupported_function("to_stata")
diff --git a/python/pyspark/pandas/tests/connect/io/test_parity_feather.py b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py
new file mode 100644
index 000000000000..5d48fed547f5
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/io/test_parity_feather.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.io.test_feather import FeatherMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
+
+
+class FeatherParityTests(
+ FeatherMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+ TestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.io.test_parity_feather import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/io/test_feather.py b/python/pyspark/pandas/tests/io/test_feather.py
new file mode 100644
index 000000000000..74fa6bc7d7b6
--- /dev/null
+++ b/python/pyspark/pandas/tests/io/test_feather.py
@@ -0,0 +1,68 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
+
+
+class FeatherMixin:
+ @property
+ def pdf(self):
+ return pd.DataFrame(
+ [[1, 1.0, "a"]],
+ columns=["x", "y", "z"],
+ )
+
+ @property
+ def psdf(self):
+ return ps.from_pandas(self.pdf)
+
+ def test_to_feather(self):
+ with self.temp_dir() as dirpath:
+ path1 = f"{dirpath}/file1.feather"
+ path2 = f"{dirpath}/file2.feather"
+
+ self.pdf.to_feather(path1)
+ self.psdf.to_feather(path2)
+
+ self.assert_eq(
+ pd.read_feather(path1),
+ pd.read_feather(path2),
+ )
+
+
+class FeatherTests(
+ FeatherMixin,
+ PandasOnSparkTestCase,
+ TestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.io.test_feather import * # noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org