You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/13 06:56:48 UTC
[spark] branch master updated: [SPARK-36973][PYTHON] Deduplicate
prepare data method for HistogramPlotBase and KdePlotBase
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f678c75 [SPARK-36973][PYTHON] Deduplicate prepare data method for HistogramPlotBase and KdePlotBase
f678c75 is described below
commit f678c75d3940b2887fdb2621691b791b95d79469
Author: dch nguyen <dg...@viettel.com.vn>
AuthorDate: Wed Oct 13 15:56:09 2021 +0900
[SPARK-36973][PYTHON] Deduplicate prepare data method for HistogramPlotBase and KdePlotBase
### What changes were proposed in this pull request?
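Deduplicate the data-preparation logic shared by `HistogramPlotBase.prepare_hist_data` and `KdePlotBase.prepare_kde_data` by moving it into a new base class method, `NumericPlotBase.prepare_numeric_data`, which both classes now call.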
### Why are the changes needed?
Remove duplicated code.
Remove the two `TODO` comments that asked for this deduplication.
### Does this PR introduce _any_ user-facing change?
No, this is a developer-facing refactoring only.
### How was this patch tested?
Covered by the existing tests.
Closes #34251 from dchvn/SPARK-36973.
Lead-authored-by: dch nguyen <dg...@viettel.com.vn>
Co-authored-by: Hyukjin Kwon <gu...@gmail.com>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/plot/core.py | 31 +++++++++++--------------------
1 file changed, 11 insertions(+), 20 deletions(-)
diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py
index dc95eac..89b8320 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -98,10 +98,9 @@ class SampledPlotBase:
)
-class HistogramPlotBase:
+class NumericPlotBase:
@staticmethod
- def prepare_hist_data(data, bins):
- # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
+ def prepare_numeric_data(data):
from pyspark.pandas.series import Series
if isinstance(data, Series):
@@ -117,6 +116,13 @@ class HistogramPlotBase:
"Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
)
+ return data, numeric_data
+
+
+class HistogramPlotBase(NumericPlotBase):
+ @staticmethod
+ def prepare_hist_data(data, bins):
+ data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
if is_integer(bins):
# computes boundaries for the column
bins = HistogramPlotBase.get_bins(data.to_spark(), bins)
@@ -340,25 +346,10 @@ class BoxPlotBase:
return fliers
-class KdePlotBase:
+class KdePlotBase(NumericPlotBase):
@staticmethod
def prepare_kde_data(data):
- # TODO: this logic is similar with HistogramPlotBase. Might have to deduplicate it.
- from pyspark.pandas.series import Series
-
- if isinstance(data, Series):
- data = data.to_frame()
-
- numeric_data = data.select_dtypes(
- include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
- )
-
- # no empty frames or series allowed
- if len(numeric_data.columns) == 0:
- raise TypeError(
- "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
- )
-
+ _, numeric_data = NumericPlotBase.prepare_numeric_data(data)
return numeric_data
@staticmethod
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org