You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by vi...@apache.org on 2020/09/16 06:35:00 UTC
[incubator-superset] 04/28: fix: support non-string groupbys for pie chart (#10493)

This is an automated email from the ASF dual-hosted git repository.

villebro pushed a commit to branch 0.37
in repository https://gitbox.apache.org/repos/asf/incubator-superset.git

commit ddac4b88386b13d1aeececfb26aac4ad64709a67
Author: Ville Brofeldt <33...@users.noreply.github.com>
AuthorDate: Fri Jul 31 11:19:21 2020 +0300

    fix: support non-string groupbys for pie chart (#10493)
    
    * chore: add unit tests to pie chart
    
    * refine logic for floats and nans and add more tests
---
 superset/viz.py    | 34 +++++++++++++++++++++++++++++++-
 tests/viz_tests.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/superset/viz.py b/superset/viz.py
index 3ce9434..77b228b 100644
--- a/superset/viz.py
+++ b/superset/viz.py
@@ -1544,11 +1544,43 @@ class DistributionPieViz(NVD3Viz):
     is_timeseries = False
 
     def get_data(self, df: pd.DataFrame) -> VizData:
+        def _label_aggfunc(labels: pd.Series) -> str:
+            """
+            Convert a single- or multi-column label into a single label, replacing
+            null values with `NULL_STRING` and joining multiple columns together
+            with a comma and a space. Examples:
+
+            >>> _label_aggfunc(pd.Series(["abc"]))
+            'abc'
+            >>> _label_aggfunc(pd.Series([1]))
+            '1'
+            >>> _label_aggfunc(pd.Series(["abc", "def"]))
+            'abc, def'
+            >>> # note: floats with integer values are rendered without decimals
+            >>> _label_aggfunc(pd.Series([0.1, 2.0, 0.3]))
+            '0.1, 2, 0.3'
+            >>> _label_aggfunc(pd.Series([1, None, "abc", 0.8], dtype="object"))
+            '1, <NULL>, abc, 0.8'
+            """
+            label_list: List[str] = []
+            for label in labels:
+                if isinstance(label, str):
+                    label_recast = label
+                elif label is None or isinstance(label, float) and math.isnan(label):
+                    label_recast = NULL_STRING
+                elif isinstance(label, float) and label.is_integer():
+                    label_recast = str(int(label))
+                else:
+                    label_recast = str(label)
+                label_list.append(label_recast)
+
+            return ", ".join(label_list)
+
         if df.empty:
             return None
         metric = self.metric_labels[0]
         df = pd.DataFrame(
-            {"x": df[self.groupby].agg(func=", ".join, axis=1), "y": df[metric]}
+            {"x": df[self.groupby].agg(func=_label_aggfunc, axis=1), "y": df[metric]}
         )
         df.sort_values(by="y", ascending=False, inplace=True)
         return df.to_dict(orient="records")
diff --git a/tests/viz_tests.py b/tests/viz_tests.py
index 17e43d8..b76c95c 100644
--- a/tests/viz_tests.py
+++ b/tests/viz_tests.py
@@ -20,6 +20,7 @@ from datetime import datetime
 import logging
 from math import nan
 from unittest.mock import Mock, patch
+from typing import Any, Dict, List, Set
 
 import numpy as np
 import pandas as pd
@@ -1322,3 +1323,60 @@ class TestPivotTableViz(SupersetTestCase):
             viz.PivotTableViz.get_aggfunc("strcol", self.df, {"pandas_aggfunc": "min"})
             == "min"
         )
+
+
+class TestDistributionPieViz(SupersetTestCase):
+    base_df = pd.DataFrame(
+        data={
+            "intcol": [1, 2, 3, 4, None],
+            "floatcol": [1.0, 0.2, 0.3, 0.4, None],
+            "strcol_a": ["a", "a", "a", "a", None],
+            "strcol": ["a", "b", "c", None, "d"],
+        }
+    )
+
+    @staticmethod
+    def get_cols(data: List[Dict[str, Any]]) -> Set[str]:
+        return set([row["x"] for row in data])
+
+    def test_bool_groupby(self):
+        datasource = self.get_datasource_mock()
+        df = pd.DataFrame(data={"intcol": [1, 2, None], "boolcol": [True, None, False]})
+
+        pie_viz = viz.DistributionPieViz(
+            datasource, {"metrics": ["intcol"], "groupby": ["boolcol"]},
+        )
+        data = pie_viz.get_data(df)
+        assert self.get_cols(data) == {"True", "False", "<NULL>"}
+
+    def test_string_groupby(self):
+        datasource = self.get_datasource_mock()
+        pie_viz = viz.DistributionPieViz(
+            datasource, {"metrics": ["floatcol"], "groupby": ["strcol"]},
+        )
+        data = pie_viz.get_data(self.base_df)
+        assert self.get_cols(data) == {"<NULL>", "a", "b", "c", "d"}
+
+    def test_int_groupby(self):
+        datasource = self.get_datasource_mock()
+        pie_viz = viz.DistributionPieViz(
+            datasource, {"metrics": ["floatcol"], "groupby": ["intcol"]},
+        )
+        data = pie_viz.get_data(self.base_df)
+        assert self.get_cols(data) == {"<NULL>", "1", "2", "3", "4"}
+
+    def test_float_groupby(self):
+        datasource = self.get_datasource_mock()
+        pie_viz = viz.DistributionPieViz(
+            datasource, {"metrics": ["intcol"], "groupby": ["floatcol"]},
+        )
+        data = pie_viz.get_data(self.base_df)
+        assert self.get_cols(data) == {"<NULL>", "1", "0.2", "0.3", "0.4"}
+
+    def test_multi_groupby(self):
+        datasource = self.get_datasource_mock()
+        pie_viz = viz.DistributionPieViz(
+            datasource, {"metrics": ["floatcol"], "groupby": ["intcol", "strcol"]},
+        )
+        data = pie_viz.get_data(self.base_df)
+        assert self.get_cols(data) == {"1, a", "2, b", "3, c", "4, <NULL>", "<NULL>, d"}