You are viewing a plain-text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@superset.apache.org by yj...@apache.org on 2021/02/04 18:47:53 UTC

[superset] branch master updated: fix(viz): improve dtype inference logic (#12933)

This is an automated email from the ASF dual-hosted git repository.

yjc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git


The following commit(s) were added to refs/heads/master by this push:
     new ac73991  fix(viz): improve dtype inference logic (#12933)
ac73991 is described below

commit ac73991913c596a0b2b37aaf96083c227088dad6
Author: Ville Brofeldt <33...@users.noreply.github.com>
AuthorDate: Thu Feb 4 20:47:27 2021 +0200

    fix(viz): improve dtype inference logic (#12933)
---
 superset-frontend/package-lock.json | 28 ++++++------------------
 superset-frontend/package.json      |  4 ++--
 superset/common/query_context.py    |  2 +-
 superset/utils/core.py              | 35 +++++++++++++++++++-----------
 tests/utils_tests.py                | 43 +++++++++++++++++++++++++++++++++----
 5 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/superset-frontend/package-lock.json b/superset-frontend/package-lock.json
index d17af47..1307f99 100644
--- a/superset-frontend/package-lock.json
+++ b/superset-frontend/package-lock.json
@@ -19064,22 +19064,21 @@
       }
     },
     "@superset-ui/plugin-chart-echarts": {
-      "version": "0.17.5",
-      "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-echarts/-/plugin-chart-echarts-0.17.5.tgz",
-      "integrity": "sha512-7HpREiMqUUR1wiYxZVOlR/25jMeWRsShBHulanGYGSR6DHuvjL1S/lq1WiK6qbUxqZ/iDhzoJpSsFzZxkhbwdA==",
+      "version": "0.17.6",
+      "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-echarts/-/plugin-chart-echarts-0.17.6.tgz",
+      "integrity": "sha512-C0OXO7yrkVbHhILVb8fvxbM8O/+iB6eA8NCwwOiYNJZ/HfToZGr+ebbA6uCTf/A/WWdl8Qh/lIX8vz+GJe8d/A==",
       "requires": {
         "@superset-ui/chart-controls": "0.17.5",
         "@superset-ui/core": "0.17.5",
-        "@types/echarts": "^4.9.3",
         "@types/mathjs": "^6.0.7",
-        "echarts": "^5.0.0",
+        "echarts": "^5.0.1",
         "mathjs": "^8.0.1"
       }
     },
     "@superset-ui/plugin-chart-table": {
-      "version": "0.17.5",
-      "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-table/-/plugin-chart-table-0.17.5.tgz",
-      "integrity": "sha512-OJmQJkCtNZORjl2sLhkyJPQaCOV7igHJRDnVbRBDa5rTVoYETK5lb3j6fO/Oxq1PXufqwsf58SJeVzl3NCogHA==",
+      "version": "0.17.6",
+      "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-table/-/plugin-chart-table-0.17.6.tgz",
+      "integrity": "sha512-s5pfX1/AxKuiTlBOovBjI8fmMjkn2gcjxWsrhjrvo9sfeCtZ64PObTobNiKHlKrc43YLf2ZKiNY9MiWBlsT3ZA==",
       "requires": {
         "@emotion/core": "^10.0.28",
         "@superset-ui/chart-controls": "0.17.5",
@@ -20817,14 +20816,6 @@
         "@types/node": "*"
       }
     },
-    "@types/echarts": {
-      "version": "4.9.3",
-      "resolved": "https://registry.npmjs.org/@types/echarts/-/echarts-4.9.3.tgz",
-      "integrity": "sha512-CbgZUYdLy1G2BhCI6maBwVXmrqIx/D8KwUccMXQ9W2uyXNMjBvpIRXSs+UaBtvUihPV2f0g7LGj/yua1iY0VbQ==",
-      "requires": {
-        "@types/zrender": "*"
-      }
-    },
     "@types/enzyme": {
       "version": "3.10.5",
       "resolved": "https://registry.npmjs.org/@types/enzyme/-/enzyme-3.10.5.tgz",
@@ -21520,11 +21511,6 @@
       "integrity": "sha512-FA/BWv8t8ZWJ+gEOnLLd8ygxH/2UFbAvgEonyfN6yWGLKc7zVjbpl2Y4CTjid9h2RfgPP6SEt6uHwEOply00yw==",
       "dev": true
     },
-    "@types/zrender": {
-      "version": "4.0.0",
-      "resolved": "https://registry.npmjs.org/@types/zrender/-/zrender-4.0.0.tgz",
-      "integrity": "sha512-s89GOIeKFiod2KSqHkfd2rzx+T2DVu7ihZCBEBnhFrzvQPUmzvDSBot9Fi1DfMQm9Odg+rTqoMGC38RvrwJK2w=="
-    },
     "@typescript-eslint/eslint-plugin": {
       "version": "4.1.0",
       "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.1.0.tgz",
diff --git a/superset-frontend/package.json b/superset-frontend/package.json
index 1ab6654..76721f6 100644
--- a/superset-frontend/package.json
+++ b/superset-frontend/package.json
@@ -89,8 +89,8 @@
     "@superset-ui/legacy-preset-chart-big-number": "^0.17.5",
     "@superset-ui/legacy-preset-chart-deckgl": "^0.4.1",
     "@superset-ui/legacy-preset-chart-nvd3": "^0.17.5",
-    "@superset-ui/plugin-chart-echarts": "^0.17.5",
-    "@superset-ui/plugin-chart-table": "^0.17.5",
+    "@superset-ui/plugin-chart-echarts": "^0.17.6",
+    "@superset-ui/plugin-chart-table": "^0.17.6",
     "@superset-ui/plugin-chart-word-cloud": "^0.17.5",
     "@superset-ui/preset-chart-xy": "^0.17.5",
     "@vx/responsive": "^0.0.195",
diff --git a/superset/common/query_context.py b/superset/common/query_context.py
index 6e3fc9f..01bc170 100644
--- a/superset/common/query_context.py
+++ b/superset/common/query_context.py
@@ -181,7 +181,7 @@ class QueryContext:
         status = payload["status"]
         if status != utils.QueryStatus.FAILED:
             payload["colnames"] = list(df.columns)
-            payload["coltypes"] = utils.serialize_pandas_dtypes(df.dtypes)
+            payload["coltypes"] = utils.extract_dataframe_dtypes(df)
             payload["data"] = self.get_data(df)
         del payload["df"]
 
diff --git a/superset/utils/core.py b/superset/utils/core.py
index 8c6da5c..ceec948 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -74,6 +74,7 @@ from flask_appbuilder import SQLA
 from flask_appbuilder.security.sqla.models import Role, User
 from flask_babel import gettext as __
 from flask_babel.speaklater import LazyString
+from pandas.api.types import infer_dtype
 from sqlalchemy import event, exc, select, Text
 from sqlalchemy.dialects.mysql import MEDIUMTEXT
 from sqlalchemy.engine import Connection, Engine
@@ -1401,19 +1402,29 @@ def get_column_names_from_metrics(metrics: List[Metric]) -> List[str]:
     return columns
 
 
-def serialize_pandas_dtypes(dtypes: List[np.dtype]) -> List[GenericDataType]:
-    """Serialize pandas/numpy dtypes to JavaScript types"""
-    mapping = {
-        "object": GenericDataType.STRING,
-        "category": GenericDataType.STRING,
-        "datetime64[ns]": GenericDataType.TEMPORAL,
-        "int64": GenericDataType.NUMERIC,
-        "in32": GenericDataType.NUMERIC,
-        "float64": GenericDataType.NUMERIC,
-        "float32": GenericDataType.NUMERIC,
-        "bool": GenericDataType.BOOLEAN,
+def extract_dataframe_dtypes(df: pd.DataFrame) -> List[GenericDataType]:
+    """Serialize pandas/numpy dtypes to generic types"""
+
+    # omitting string types as those will be the default type
+    inferred_type_map: Dict[str, GenericDataType] = {
+        "floating": GenericDataType.NUMERIC,
+        "integer": GenericDataType.NUMERIC,
+        "mixed-integer-float": GenericDataType.NUMERIC,
+        "decimal": GenericDataType.NUMERIC,
+        "boolean": GenericDataType.BOOLEAN,
+        "datetime64": GenericDataType.TEMPORAL,
+        "datetime": GenericDataType.TEMPORAL,
+        "date": GenericDataType.TEMPORAL,
     }
-    return [mapping.get(str(x), GenericDataType.STRING) for x in dtypes]
+
+    generic_types: List[GenericDataType] = []
+    for column in df.columns:
+        series = df[column]
+        inferred_type = infer_dtype(series)
+        generic_type = inferred_type_map.get(inferred_type, GenericDataType.STRING)
+        generic_types.append(generic_type)
+
+    return generic_types
 
 
 def indexed(
diff --git a/tests/utils_tests.py b/tests/utils_tests.py
index fbdf131..ab2f6ea 100644
--- a/tests/utils_tests.py
+++ b/tests/utils_tests.py
@@ -23,10 +23,12 @@ import hashlib
 import json
 import os
 import re
+from typing import Any, Tuple, List
 from unittest.mock import Mock, patch
 from tests.fixtures.birth_names_dashboard import load_birth_names_dashboard_with_slices
 
-import numpy
+import numpy as np
+import pandas as pd
 import pytest
 from flask import Flask, g
 import marshmallow
@@ -44,6 +46,7 @@ from superset.utils.core import (
     convert_legacy_filters_into_adhoc,
     create_ssl_cert_file,
     format_timedelta,
+    GenericDataType,
     get_form_data_token,
     get_iterable,
     get_email_address_list,
@@ -57,6 +60,7 @@ from superset.utils.core import (
     merge_request_params,
     parse_ssl_cert,
     parse_js_uri_path_item,
+    extract_dataframe_dtypes,
     split,
     TimeRangeEndpoint,
     validate_json,
@@ -113,9 +117,9 @@ class TestUtils(SupersetTestCase):
             json_iso_dttm_ser("this is not a date")
 
     def test_base_json_conv(self):
-        assert isinstance(base_json_conv(numpy.bool_(1)), bool) is True
-        assert isinstance(base_json_conv(numpy.int64(1)), int) is True
-        assert isinstance(base_json_conv(numpy.array([1, 2, 3])), list) is True
+        assert isinstance(base_json_conv(np.bool_(1)), bool) is True
+        assert isinstance(base_json_conv(np.int64(1)), int) is True
+        assert isinstance(base_json_conv(np.array([1, 2, 3])), list) is True
         assert isinstance(base_json_conv(set([1])), list) is True
         assert isinstance(base_json_conv(Decimal("1.0")), float) is True
         assert isinstance(base_json_conv(uuid.uuid4()), str) is True
@@ -1066,3 +1070,34 @@ class TestUtils(SupersetTestCase):
         assert get_form_data_token({"token": "token_abcdefg1"}) == "token_abcdefg1"
         generated_token = get_form_data_token({})
         assert re.match(r"^token_[a-z0-9]{8}$", generated_token) is not None
+
+    def test_extract_dataframe_dtypes(self):
+        cols: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
+            ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
+            (
+                "dttm",
+                GenericDataType.TEMPORAL,
+                [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
+            ),
+            ("str", GenericDataType.STRING, ["foo", "foo"]),
+            ("int", GenericDataType.NUMERIC, [1, 1]),
+            ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
+            ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
+            ("bool", GenericDataType.BOOLEAN, [True, False]),
+            ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
+            ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
+            ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
+            (
+                "dttm_null",
+                GenericDataType.TEMPORAL,
+                [None, datetime(2021, 2, 4, 1, 1, 1)],
+            ),
+            ("str_null", GenericDataType.STRING, [None, "foo"]),
+            ("int_null", GenericDataType.NUMERIC, [None, 1]),
+            ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
+            ("bool_null", GenericDataType.BOOLEAN, [None, False]),
+            ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
+        )
+
+        df = pd.DataFrame(data={col[0]: col[2] for col in cols})
+        assert extract_dataframe_dtypes(df) == [col[1] for col in cols]