You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2019/02/27 00:22:52 UTC
[spark] branch master updated: [SPARK-26449][PYTHON] Add transform
method to DataFrame API
This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 387efe2 [SPARK-26449][PYTHON] Add transform method to DataFrame API
387efe2 is described below
commit 387efe29b7a5e2c64a898e50bd7dc98b641f7e96
Author: Hellsen83 <er...@gmail.com>
AuthorDate: Tue Feb 26 18:22:36 2019 -0600
[SPARK-26449][PYTHON] Add transform method to DataFrame API
## What changes were proposed in this pull request?
Added .transform() method to Python DataFrame API to be in sync with Scala API.
## How was this patch tested?
Addition has been tested manually.
Closes #23877 from Hellsen83/pyspark-dataframe-transform.
Authored-by: Hellsen83 <er...@gmail.com>
Signed-off-by: Sean Owen <se...@databricks.com>
---
python/pyspark/sql/dataframe.py | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 472d296..75dd9fb 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2046,6 +2046,31 @@ class DataFrame(object):
jdf = self._jdf.toDF(self._jseq(cols))
return DataFrame(jdf, self.sql_ctx)
+    @since(3.0)
+    def transform(self, func):
+        """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
+
+        :param func: a function that takes and returns a :class:`DataFrame`.
+
+        >>> from pyspark.sql.functions import col
+        >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
+        >>> def cast_all_to_int(input_df):
+        ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
+        >>> def sort_columns_asc(input_df):
+        ...     return input_df.select(*sorted(input_df.columns))
+        >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
+        +-----+---+
+        |float|int|
+        +-----+---+
+        |    1|  1|
+        |    2|  2|
+        +-----+---+
+        """
+        result = func(self)
+        assert isinstance(result, DataFrame), "Func returned an instance of type [%s], " \
+                                              "should have been DataFrame." % type(result)
+        return result
+
@since(1.3)
def toPandas(self):
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org