You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by yj...@apache.org on 2021/02/04 18:47:53 UTC
[superset] branch master updated: fix(viz): improve dtype inference logic (#12933)
This is an automated email from the ASF dual-hosted git repository.
yjc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/master by this push:
new ac73991 fix(viz): improve dtype inference logic (#12933)
ac73991 is described below
commit ac73991913c596a0b2b37aaf96083c227088dad6
Author: Ville Brofeldt <33...@users.noreply.github.com>
AuthorDate: Thu Feb 4 20:47:27 2021 +0200
fix(viz): improve dtype inference logic (#12933)
---
superset-frontend/package-lock.json | 28 ++++++------------------
superset-frontend/package.json | 4 ++--
superset/common/query_context.py | 2 +-
superset/utils/core.py | 35 +++++++++++++++++++-----------
tests/utils_tests.py | 43 +++++++++++++++++++++++++++++++++----
5 files changed, 72 insertions(+), 40 deletions(-)
diff --git a/superset-frontend/package-lock.json b/superset-frontend/package-lock.json
index d17af47..1307f99 100644
--- a/superset-frontend/package-lock.json
+++ b/superset-frontend/package-lock.json
@@ -19064,22 +19064,21 @@
}
},
"@superset-ui/plugin-chart-echarts": {
- "version": "0.17.5",
- "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-echarts/-/plugin-chart-echarts-0.17.5.tgz",
- "integrity": "sha512-7HpREiMqUUR1wiYxZVOlR/25jMeWRsShBHulanGYGSR6DHuvjL1S/lq1WiK6qbUxqZ/iDhzoJpSsFzZxkhbwdA==",
+ "version": "0.17.6",
+ "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-echarts/-/plugin-chart-echarts-0.17.6.tgz",
+ "integrity": "sha512-C0OXO7yrkVbHhILVb8fvxbM8O/+iB6eA8NCwwOiYNJZ/HfToZGr+ebbA6uCTf/A/WWdl8Qh/lIX8vz+GJe8d/A==",
"requires": {
"@superset-ui/chart-controls": "0.17.5",
"@superset-ui/core": "0.17.5",
- "@types/echarts": "^4.9.3",
"@types/mathjs": "^6.0.7",
- "echarts": "^5.0.0",
+ "echarts": "^5.0.1",
"mathjs": "^8.0.1"
}
},
"@superset-ui/plugin-chart-table": {
- "version": "0.17.5",
- "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-table/-/plugin-chart-table-0.17.5.tgz",
- "integrity": "sha512-OJmQJkCtNZORjl2sLhkyJPQaCOV7igHJRDnVbRBDa5rTVoYETK5lb3j6fO/Oxq1PXufqwsf58SJeVzl3NCogHA==",
+ "version": "0.17.6",
+ "resolved": "https://registry.npmjs.org/@superset-ui/plugin-chart-table/-/plugin-chart-table-0.17.6.tgz",
+ "integrity": "sha512-s5pfX1/AxKuiTlBOovBjI8fmMjkn2gcjxWsrhjrvo9sfeCtZ64PObTobNiKHlKrc43YLf2ZKiNY9MiWBlsT3ZA==",
"requires": {
"@emotion/core": "^10.0.28",
"@superset-ui/chart-controls": "0.17.5",
@@ -20817,14 +20816,6 @@
"@types/node": "*"
}
},
- "@types/echarts": {
- "version": "4.9.3",
- "resolved": "https://registry.npmjs.org/@types/echarts/-/echarts-4.9.3.tgz",
- "integrity": "sha512-CbgZUYdLy1G2BhCI6maBwVXmrqIx/D8KwUccMXQ9W2uyXNMjBvpIRXSs+UaBtvUihPV2f0g7LGj/yua1iY0VbQ==",
- "requires": {
- "@types/zrender": "*"
- }
- },
"@types/enzyme": {
"version": "3.10.5",
"resolved": "https://registry.npmjs.org/@types/enzyme/-/enzyme-3.10.5.tgz",
@@ -21520,11 +21511,6 @@
"integrity": "sha512-FA/BWv8t8ZWJ+gEOnLLd8ygxH/2UFbAvgEonyfN6yWGLKc7zVjbpl2Y4CTjid9h2RfgPP6SEt6uHwEOply00yw==",
"dev": true
},
- "@types/zrender": {
- "version": "4.0.0",
- "resolved": "https://registry.npmjs.org/@types/zrender/-/zrender-4.0.0.tgz",
- "integrity": "sha512-s89GOIeKFiod2KSqHkfd2rzx+T2DVu7ihZCBEBnhFrzvQPUmzvDSBot9Fi1DfMQm9Odg+rTqoMGC38RvrwJK2w=="
- },
"@typescript-eslint/eslint-plugin": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.1.0.tgz",
diff --git a/superset-frontend/package.json b/superset-frontend/package.json
index 1ab6654..76721f6 100644
--- a/superset-frontend/package.json
+++ b/superset-frontend/package.json
@@ -89,8 +89,8 @@
"@superset-ui/legacy-preset-chart-big-number": "^0.17.5",
"@superset-ui/legacy-preset-chart-deckgl": "^0.4.1",
"@superset-ui/legacy-preset-chart-nvd3": "^0.17.5",
- "@superset-ui/plugin-chart-echarts": "^0.17.5",
- "@superset-ui/plugin-chart-table": "^0.17.5",
+ "@superset-ui/plugin-chart-echarts": "^0.17.6",
+ "@superset-ui/plugin-chart-table": "^0.17.6",
"@superset-ui/plugin-chart-word-cloud": "^0.17.5",
"@superset-ui/preset-chart-xy": "^0.17.5",
"@vx/responsive": "^0.0.195",
diff --git a/superset/common/query_context.py b/superset/common/query_context.py
index 6e3fc9f..01bc170 100644
--- a/superset/common/query_context.py
+++ b/superset/common/query_context.py
@@ -181,7 +181,7 @@ class QueryContext:
status = payload["status"]
if status != utils.QueryStatus.FAILED:
payload["colnames"] = list(df.columns)
- payload["coltypes"] = utils.serialize_pandas_dtypes(df.dtypes)
+ payload["coltypes"] = utils.extract_dataframe_dtypes(df)
payload["data"] = self.get_data(df)
del payload["df"]
diff --git a/superset/utils/core.py b/superset/utils/core.py
index 8c6da5c..ceec948 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -74,6 +74,7 @@ from flask_appbuilder import SQLA
from flask_appbuilder.security.sqla.models import Role, User
from flask_babel import gettext as __
from flask_babel.speaklater import LazyString
+from pandas.api.types import infer_dtype
from sqlalchemy import event, exc, select, Text
from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlalchemy.engine import Connection, Engine
@@ -1401,19 +1402,29 @@ def get_column_names_from_metrics(metrics: List[Metric]) -> List[str]:
return columns
-def serialize_pandas_dtypes(dtypes: List[np.dtype]) -> List[GenericDataType]:
- """Serialize pandas/numpy dtypes to JavaScript types"""
- mapping = {
- "object": GenericDataType.STRING,
- "category": GenericDataType.STRING,
- "datetime64[ns]": GenericDataType.TEMPORAL,
- "int64": GenericDataType.NUMERIC,
- "in32": GenericDataType.NUMERIC,
- "float64": GenericDataType.NUMERIC,
- "float32": GenericDataType.NUMERIC,
- "bool": GenericDataType.BOOLEAN,
+def extract_dataframe_dtypes(df: pd.DataFrame) -> List[GenericDataType]:
+ """Serialize pandas/numpy dtypes to generic types"""
+
+ # omitting string types as those will be the default type
+ inferred_type_map: Dict[str, GenericDataType] = {
+ "floating": GenericDataType.NUMERIC,
+ "integer": GenericDataType.NUMERIC,
+ "mixed-integer-float": GenericDataType.NUMERIC,
+ "decimal": GenericDataType.NUMERIC,
+ "boolean": GenericDataType.BOOLEAN,
+ "datetime64": GenericDataType.TEMPORAL,
+ "datetime": GenericDataType.TEMPORAL,
+ "date": GenericDataType.TEMPORAL,
}
- return [mapping.get(str(x), GenericDataType.STRING) for x in dtypes]
+
+ generic_types: List[GenericDataType] = []
+ for column in df.columns:
+ series = df[column]
+ inferred_type = infer_dtype(series)
+ generic_type = inferred_type_map.get(inferred_type, GenericDataType.STRING)
+ generic_types.append(generic_type)
+
+ return generic_types
def indexed(
diff --git a/tests/utils_tests.py b/tests/utils_tests.py
index fbdf131..ab2f6ea 100644
--- a/tests/utils_tests.py
+++ b/tests/utils_tests.py
@@ -23,10 +23,12 @@ import hashlib
import json
import os
import re
+from typing import Any, Tuple, List
from unittest.mock import Mock, patch
from tests.fixtures.birth_names_dashboard import load_birth_names_dashboard_with_slices
-import numpy
+import numpy as np
+import pandas as pd
import pytest
from flask import Flask, g
import marshmallow
@@ -44,6 +46,7 @@ from superset.utils.core import (
convert_legacy_filters_into_adhoc,
create_ssl_cert_file,
format_timedelta,
+ GenericDataType,
get_form_data_token,
get_iterable,
get_email_address_list,
@@ -57,6 +60,7 @@ from superset.utils.core import (
merge_request_params,
parse_ssl_cert,
parse_js_uri_path_item,
+ extract_dataframe_dtypes,
split,
TimeRangeEndpoint,
validate_json,
@@ -113,9 +117,9 @@ class TestUtils(SupersetTestCase):
json_iso_dttm_ser("this is not a date")
def test_base_json_conv(self):
- assert isinstance(base_json_conv(numpy.bool_(1)), bool) is True
- assert isinstance(base_json_conv(numpy.int64(1)), int) is True
- assert isinstance(base_json_conv(numpy.array([1, 2, 3])), list) is True
+ assert isinstance(base_json_conv(np.bool_(1)), bool) is True
+ assert isinstance(base_json_conv(np.int64(1)), int) is True
+ assert isinstance(base_json_conv(np.array([1, 2, 3])), list) is True
assert isinstance(base_json_conv(set([1])), list) is True
assert isinstance(base_json_conv(Decimal("1.0")), float) is True
assert isinstance(base_json_conv(uuid.uuid4()), str) is True
@@ -1066,3 +1070,34 @@ class TestUtils(SupersetTestCase):
assert get_form_data_token({"token": "token_abcdefg1"}) == "token_abcdefg1"
generated_token = get_form_data_token({})
assert re.match(r"^token_[a-z0-9]{8}$", generated_token) is not None
+
+ def test_extract_dataframe_dtypes(self):
+ cols: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
+ ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
+ (
+ "dttm",
+ GenericDataType.TEMPORAL,
+ [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
+ ),
+ ("str", GenericDataType.STRING, ["foo", "foo"]),
+ ("int", GenericDataType.NUMERIC, [1, 1]),
+ ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
+ ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
+ ("bool", GenericDataType.BOOLEAN, [True, False]),
+ ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
+ ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
+ ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
+ (
+ "dttm_null",
+ GenericDataType.TEMPORAL,
+ [None, datetime(2021, 2, 4, 1, 1, 1)],
+ ),
+ ("str_null", GenericDataType.STRING, [None, "foo"]),
+ ("int_null", GenericDataType.NUMERIC, [None, 1]),
+ ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
+ ("bool_null", GenericDataType.BOOLEAN, [None, False]),
+ ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
+ )
+
+ df = pd.DataFrame(data={col[0]: col[2] for col in cols})
+ assert extract_dataframe_dtypes(df) == [col[1] for col in cols]