You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/13 06:56:48 UTC
[spark] branch master updated: [SPARK-36973][PYTHON] Deduplicate prepare data method for HistogramPlotBase and KdePlotBase

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new f678c75  [SPARK-36973][PYTHON] Deduplicate prepare data method for HistogramPlotBase and KdePlotBase
f678c75 is described below

commit f678c75d3940b2887fdb2621691b791b95d79469
Author: dch nguyen <dg...@viettel.com.vn>
AuthorDate: Wed Oct 13 15:56:09 2021 +0900

    [SPARK-36973][PYTHON] Deduplicate prepare data method for HistogramPlotBase and KdePlotBase
    
    ### What changes were proposed in this pull request?
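    Deduplicate the data-preparation methods of `HistogramPlotBase` and `KdePlotBase` by extracting their shared logic into a new `NumericPlotBase.prepare_numeric_data` static method that both classes now call.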
    
    ### Why are the changes needed?
    Deduplicate code.
    Remove the 2 `TODO` comments that asked for this deduplication.
    
    ### Does this PR introduce _any_ user-facing change?
    No, this is a developer-facing refactoring only.
    
    ### How was this patch tested?
    Existing tests
    
    Closes #34251 from dchvn/SPARK-36973.
    
    Lead-authored-by: dch nguyen <dg...@viettel.com.vn>
    Co-authored-by: Hyukjin Kwon <gu...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 python/pyspark/pandas/plot/core.py | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py
index dc95eac..89b8320 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -98,10 +98,9 @@ class SampledPlotBase:
             )
 
 
-class HistogramPlotBase:
+class NumericPlotBase:
     @staticmethod
-    def prepare_hist_data(data, bins):
-        # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
+    def prepare_numeric_data(data):
         from pyspark.pandas.series import Series
 
         if isinstance(data, Series):
@@ -117,6 +116,13 @@ class HistogramPlotBase:
                 "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
             )
 
+        return data, numeric_data
+
+
+class HistogramPlotBase(NumericPlotBase):
+    @staticmethod
+    def prepare_hist_data(data, bins):
+        data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
         if is_integer(bins):
             # computes boundaries for the column
             bins = HistogramPlotBase.get_bins(data.to_spark(), bins)
@@ -340,25 +346,10 @@ class BoxPlotBase:
         return fliers
 
 
-class KdePlotBase:
+class KdePlotBase(NumericPlotBase):
     @staticmethod
     def prepare_kde_data(data):
-        # TODO: this logic is similar with HistogramPlotBase. Might have to deduplicate it.
-        from pyspark.pandas.series import Series
-
-        if isinstance(data, Series):
-            data = data.to_frame()
-
-        numeric_data = data.select_dtypes(
-            include=["byte", "decimal", "integer", "float", "long", "double", np.datetime64]
-        )
-
-        # no empty frames or series allowed
-        if len(numeric_data.columns) == 0:
-            raise TypeError(
-                "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
-            )
-
+        _, numeric_data = NumericPlotBase.prepare_numeric_data(data)
         return numeric_data
 
     @staticmethod

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org