You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by jo...@apache.org on 2020/02/07 01:25:36 UTC
[incubator-superset] branch master updated: SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)
This is an automated email from the ASF dual-hosted git repository.
johnbodley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-superset.git
The following commit(s) were added to refs/heads/master by this push:
new 161d211 SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)
161d211 is described below
commit 161d211c07398a14992cf79784b566b7b9f1fb39
Author: Rob DiCiuccio <ro...@gmail.com>
AuthorDate: Thu Feb 6 17:25:22 2020 -0800
SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)
* Use numpy structured arrays, fallback to JSON serialization
* Explicitly cast data as list when creating numpy array
---
superset/result_set.py | 55 +++++++++++++++++++++++++++++++++--------------
tests/result_set_tests.py | 34 +++++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 16 deletions(-)
diff --git a/superset/result_set.py b/superset/result_set.py
index 538d878..5d7e378 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -57,6 +57,15 @@ def dedup(l: List[str], suffix: str = "__", case_sensitive: bool = True) -> List
return new_l
+def stringify(obj: Any) -> str:
+ return json.dumps(obj, default=utils.json_iso_dttm_ser)
+
+
+def stringify_values(array: np.ndarray) -> np.ndarray:
+ vstringify: Callable = np.vectorize(stringify)
+ return vstringify(array)
+
+
class SupersetResultSet:
def __init__(
self,
@@ -68,6 +77,8 @@ class SupersetResultSet:
column_names: List[str] = []
pa_data: List[pa.Array] = []
deduped_cursor_desc: List[Tuple[Any, ...]] = []
+ numpy_dtype: List[Tuple[str, ...]] = []
+ stringified_arr: np.ndarray
if cursor_description:
# get deduped list of column names
@@ -79,33 +90,45 @@ class SupersetResultSet:
for column_name, description in zip(column_names, cursor_description)
]
- # put data in a 2D array so we can efficiently access each column;
- array = np.array(data, dtype="object")
+ # generate numpy structured array dtype
+ numpy_dtype = [(column_name, "object") for column_name in column_names]
+
+ # put data in a structured array so we can efficiently access each column.
+ # cast `data` as list due to MySQL (others?) wrapping results with a tuple.
+ array = np.array(list(data), dtype=numpy_dtype)
if array.size > 0:
- pa_data = [pa.array(array[:, i]) for i, column in enumerate(column_names)]
+ for column in column_names:
+ try:
+ pa_data.append(pa.array(array[column].tolist()))
+ except (
+ pa.lib.ArrowInvalid,
+ pa.lib.ArrowTypeError,
+ pa.lib.ArrowNotImplementedError,
+ ):
+ # attempt serialization of values as strings
+ stringified_arr = stringify_values(array[column])
+ pa_data.append(pa.array(stringified_arr.tolist()))
- # workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
- # related: https://issues.apache.org/jira/browse/ARROW-5248
if pa_data:
for i, column in enumerate(column_names):
- # TODO: revisit nested column serialization once Arrow 1.0 is released with:
- # https://github.com/apache/arrow/pull/6199
- # Related issue: #8978
if pa.types.is_nested(pa_data[i].type):
- stringify_func = lambda item: json.dumps(
- item, default=utils.json_iso_dttm_ser
- )
- vfunc = np.vectorize(stringify_func)
- strigified_arr = vfunc(array[:, i])
- pa_data[i] = pa.array(strigified_arr)
+ # TODO: revisit nested column serialization once PyArrow updated with:
+ # https://github.com/apache/arrow/pull/6199
+ # Related issue: https://github.com/apache/incubator-superset/issues/8978
+ stringified_arr = stringify_values(array[column])
+ pa_data[i] = pa.array(stringified_arr.tolist())
elif pa.types.is_temporal(pa_data[i].type):
- sample = self.first_nonempty(array[:, i])
+ # workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
+ # related: https://issues.apache.org/jira/browse/ARROW-5248
+ sample = self.first_nonempty(array[column])
if sample and isinstance(sample, datetime.datetime):
try:
if sample.tzinfo:
tz = sample.tzinfo
- series = pd.Series(array[:, i], dtype="datetime64[ns]")
+ series = pd.Series(
+ array[column], dtype="datetime64[ns]"
+ )
series = pd.to_datetime(series).dt.tz_localize(tz)
pa_data[i] = pa.Array.from_pandas(
series, type=pa.timestamp("ns", tz=tz)
diff --git a/tests/result_set_tests.py b/tests/result_set_tests.py
index 1f697cc..9df461f 100644
--- a/tests/result_set_tests.py
+++ b/tests/result_set_tests.py
@@ -166,6 +166,40 @@ class SupersetResultSetTestCase(SupersetTestCase):
],
)
+ def test_single_column_multidim_nested_types(self):
+ data = [
+ (
+ [
+ "test",
+ [
+ [
+ "foo",
+ 123456,
+ [
+ [["test"], 3432546, 7657658766],
+ [["fake"], 656756765, 324324324324],
+ ],
+ ]
+ ],
+ ["test2", 43, 765765765],
+ None,
+ None,
+ ],
+ )
+ ]
+ cursor_descr = [("metadata",)]
+ results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+ self.assertEqual(results.columns[0]["type"], "STRING")
+ df = results.to_pandas_df()
+ self.assertEqual(
+ df_to_records(df),
+ [
+ {
+ "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'
+ }
+ ],
+ )
+
def test_empty_datetime(self):
data = [(None,)]
cursor_descr = [("ds", "timestamp", None, None, None, None, True)]