You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by jo...@apache.org on 2020/02/07 01:25:36 UTC
[incubator-superset] branch master updated: SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)

This is an automated email from the ASF dual-hosted git repository.

johnbodley pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-superset.git


The following commit(s) were added to refs/heads/master by this push:
     new 161d211  SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)
161d211 is described below

commit 161d211c07398a14992cf79784b566b7b9f1fb39
Author: Rob DiCiuccio <ro...@gmail.com>
AuthorDate: Thu Feb 6 17:25:22 2020 -0800

    SQL Lab: Use numpy structured arrays, fallback to JSON serialization (#9096)
    
    * Use numpy structured arrays, fallback to JSON serialization
    
    * Explicitly cast data as list when creating numpy array
---
 superset/result_set.py    | 55 +++++++++++++++++++++++++++++++++--------------
 tests/result_set_tests.py | 34 +++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/superset/result_set.py b/superset/result_set.py
index 538d878..5d7e378 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -57,6 +57,15 @@ def dedup(l: List[str], suffix: str = "__", case_sensitive: bool = True) -> List
     return new_l
 
 
+def stringify(obj: Any) -> str:
+    return json.dumps(obj, default=utils.json_iso_dttm_ser)
+
+
+def stringify_values(array: np.ndarray) -> np.ndarray:
+    vstringify: Callable = np.vectorize(stringify)
+    return vstringify(array)
+
+
 class SupersetResultSet:
     def __init__(
         self,
@@ -68,6 +77,8 @@ class SupersetResultSet:
         column_names: List[str] = []
         pa_data: List[pa.Array] = []
         deduped_cursor_desc: List[Tuple[Any, ...]] = []
+        numpy_dtype: List[Tuple[str, ...]] = []
+        stringified_arr: np.ndarray
 
         if cursor_description:
             # get deduped list of column names
@@ -79,33 +90,45 @@ class SupersetResultSet:
                 for column_name, description in zip(column_names, cursor_description)
             ]
 
-        # put data in a 2D array so we can efficiently access each column;
-        array = np.array(data, dtype="object")
+            # generate numpy structured array dtype
+            numpy_dtype = [(column_name, "object") for column_name in column_names]
+
+        # put data in a structured array so we can efficiently access each column.
+        # cast `data` as list due to MySQL (others?) wrapping results with a tuple.
+        array = np.array(list(data), dtype=numpy_dtype)
         if array.size > 0:
-            pa_data = [pa.array(array[:, i]) for i, column in enumerate(column_names)]
+            for column in column_names:
+                try:
+                    pa_data.append(pa.array(array[column].tolist()))
+                except (
+                    pa.lib.ArrowInvalid,
+                    pa.lib.ArrowTypeError,
+                    pa.lib.ArrowNotImplementedError,
+                ):
+                    # attempt serialization of values as strings
+                    stringified_arr = stringify_values(array[column])
+                    pa_data.append(pa.array(stringified_arr.tolist()))
 
-        # workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
-        # related: https://issues.apache.org/jira/browse/ARROW-5248
         if pa_data:
             for i, column in enumerate(column_names):
-                # TODO: revisit nested column serialization once Arrow 1.0 is released with:
-                # https://github.com/apache/arrow/pull/6199
-                # Related issue: #8978
                 if pa.types.is_nested(pa_data[i].type):
-                    stringify_func = lambda item: json.dumps(
-                        item, default=utils.json_iso_dttm_ser
-                    )
-                    vfunc = np.vectorize(stringify_func)
-                    strigified_arr = vfunc(array[:, i])
-                    pa_data[i] = pa.array(strigified_arr)
+                    # TODO: revisit nested column serialization once PyArrow updated with:
+                    # https://github.com/apache/arrow/pull/6199
+                    # Related issue: https://github.com/apache/incubator-superset/issues/8978
+                    stringified_arr = stringify_values(array[column])
+                    pa_data[i] = pa.array(stringified_arr.tolist())
 
                 elif pa.types.is_temporal(pa_data[i].type):
-                    sample = self.first_nonempty(array[:, i])
+                    # workaround for bug converting `psycopg2.tz.FixedOffsetTimezone` tzinfo values.
+                    # related: https://issues.apache.org/jira/browse/ARROW-5248
+                    sample = self.first_nonempty(array[column])
                     if sample and isinstance(sample, datetime.datetime):
                         try:
                             if sample.tzinfo:
                                 tz = sample.tzinfo
-                                series = pd.Series(array[:, i], dtype="datetime64[ns]")
+                                series = pd.Series(
+                                    array[column], dtype="datetime64[ns]"
+                                )
                                 series = pd.to_datetime(series).dt.tz_localize(tz)
                                 pa_data[i] = pa.Array.from_pandas(
                                     series, type=pa.timestamp("ns", tz=tz)
diff --git a/tests/result_set_tests.py b/tests/result_set_tests.py
index 1f697cc..9df461f 100644
--- a/tests/result_set_tests.py
+++ b/tests/result_set_tests.py
@@ -166,6 +166,40 @@ class SupersetResultSetTestCase(SupersetTestCase):
             ],
         )
 
+    def test_single_column_multidim_nested_types(self):
+        data = [
+            (
+                [
+                    "test",
+                    [
+                        [
+                            "foo",
+                            123456,
+                            [
+                                [["test"], 3432546, 7657658766],
+                                [["fake"], 656756765, 324324324324],
+                            ],
+                        ]
+                    ],
+                    ["test2", 43, 765765765],
+                    None,
+                    None,
+                ],
+            )
+        ]
+        cursor_descr = [("metadata",)]
+        results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+        self.assertEqual(results.columns[0]["type"], "STRING")
+        df = results.to_pandas_df()
+        self.assertEqual(
+            df_to_records(df),
+            [
+                {
+                    "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'
+                }
+            ],
+        )
+
     def test_empty_datetime(self):
         data = [(None,)]
         cursor_descr = [("ds", "timestamp", None, None, None, None, True)]