You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@superset.apache.org by GitBox <gi...@apache.org> on 2019/12/10 22:22:38 UTC

[GitHub] [incubator-superset] willbarrett commented on a change in pull request #8733: Replace pandas.DataFrame with PyArrow.Table for nullable int typing

willbarrett commented on a change in pull request #8733: Replace pandas.DataFrame with PyArrow.Table for nullable int typing
URL: https://github.com/apache/incubator-superset/pull/8733#discussion_r356308324
 
 

 ##########
 File path: superset/dataframe.py
 ##########
 @@ -14,257 +14,21 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=C,R,W
-""" Superset wrapper around pandas.DataFrame.
-
-TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
-                 dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
-TODO(bkyryliuk): recognize integer encoded enums.
-
+""" Superset utilities for pandas.DataFrame.
 """
-import logging
-from datetime import date, datetime
-
-import numpy as np
+from typing import Dict
 import pandas as pd
-from pandas.core.common import maybe_box_datetimelike
-from pandas.core.dtypes.dtypes import ExtensionDtype
 
 from superset.utils.core import JS_MAX_INTEGER
 
-INFER_COL_TYPES_THRESHOLD = 95
-INFER_COL_TYPES_SAMPLE_SIZE = 100
-
-
-def dedup(l, suffix="__", case_sensitive=True):
-    """De-duplicates a list of string by suffixing a counter
-
-    Always returns the same number of entries as provided, and always returns
-    unique values. Case sensitive comparison by default.
-
-    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
-    foo,bar,bar__1,bar__2,Bar
-    >>> print(
-        ','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False))
-    )
-    foo,bar,bar__1,bar__2,Bar__3
-    """
-    new_l = []
-    seen = {}
-    for s in l:
-        s_fixed_case = s if case_sensitive else s.lower()
-        if s_fixed_case in seen:
-            seen[s_fixed_case] += 1
-            s += suffix + str(seen[s_fixed_case])
-        else:
-            seen[s_fixed_case] = 0
-        new_l.append(s)
-    return new_l
-
-
-def is_numeric(dtype):
-    if hasattr(dtype, "_is_numeric"):
-        return dtype._is_numeric
-    return np.issubdtype(dtype, np.number)
-
-
-class SupersetDataFrame(object):
-    # Mapping numpy dtype.char to generic database types
-    type_map = {
-        "b": "BOOL",  # boolean
-        "i": "INT",  # (signed) integer
-        "u": "INT",  # unsigned integer
-        "l": "INT",  # 64bit integer
-        "f": "FLOAT",  # floating-point
-        "c": "FLOAT",  # complex-floating point
-        "m": None,  # timedelta
-        "M": "DATETIME",  # datetime
-        "O": "OBJECT",  # (Python) objects
-        "S": "BYTE",  # (byte-)string
-        "U": "STRING",  # Unicode
-        "V": None,  # raw data (void)
-    }
-
-    def __init__(self, data, cursor_description, db_engine_spec):
-        data = data or []
-
-        column_names = []
-        dtype = None
-        if cursor_description:
-            # get deduped list of column names
-            column_names = dedup([col[0] for col in cursor_description])
-
-            # fix cursor descriptor with the deduped names
-            cursor_description = [
-                tuple([column_name, *list(description)[1:]])
-                for column_name, description in zip(column_names, cursor_description)
-            ]
-
-            # get type for better type casting, if possible
-            dtype = db_engine_spec.get_pandas_dtype(cursor_description)
-
-        self.column_names = column_names
-
-        if dtype:
-            # put data in a 2D array so we can efficiently access each column;
-            # the reshape ensures the shape is 2D in case data is empty
-            array = np.array(data, dtype="object").reshape(-1, len(column_names))
-            # convert each column in data into a Series of the proper dtype; we
-            # need to do this because we can not specify a mixed dtype when
-            # instantiating the DataFrame, and this allows us to have different
-            # dtypes for each column.
-            data = {
-                column: pd.Series(array[:, i], dtype=dtype[column])
-                for i, column in enumerate(column_names)
-            }
-            self.df = pd.DataFrame(data, columns=column_names)
-        else:
-            self.df = pd.DataFrame(list(data), columns=column_names).infer_objects()
-
-        self._type_dict = {}
-        try:
-            # The driver may not be passing a cursor.description
-            self._type_dict = {
-                col: db_engine_spec.get_datatype(cursor_description[i][1])
-                for i, col in enumerate(column_names)
-                if cursor_description
-            }
-        except Exception as e:
-            logging.exception(e)
-
-    @property
-    def raw_df(self):
-        return self.df
-
-    @property
-    def size(self):
-        return len(self.df.index)
-
-    @property
-    def data(self):
-        return self.format_data(self.df)
-
-    @classmethod
-    def format_data(cls, df):
-        # work around for https://github.com/pandas-dev/pandas/issues/18372
-        data = [
-            dict(
-                (k, maybe_box_datetimelike(v))
-                for k, v in zip(df.columns, np.atleast_1d(row))
-            )
-            for row in df.values
-        ]
-        for d in data:
-            for k, v in list(d.items()):
-                # if an int is too big for Java Script to handle
-                # convert it to a string
-                if isinstance(v, int):
-                    if abs(v) > JS_MAX_INTEGER:
-                        d[k] = str(v)
-        return data
-
-    @classmethod
-    def db_type(cls, dtype):
-        """Given a numpy dtype, Returns a generic database type"""
-        if isinstance(dtype, ExtensionDtype):
-            return cls.type_map.get(dtype.kind)
-        elif hasattr(dtype, "char"):
-            return cls.type_map.get(dtype.char)
-
-    @classmethod
-    def datetime_conversion_rate(cls, data_series):
-        success = 0
-        total = 0
-        for value in data_series:
-            total += 1
-            try:
-                pd.to_datetime(value)
-                success += 1
-            except Exception:
-                continue
-        return 100 * success / total
-
-    @staticmethod
-    def is_date(np_dtype, db_type_str):
-        def looks_daty(s):
-            if isinstance(s, str):
-                return any([s.lower().startswith(ss) for ss in ("time", "date")])
-            return False
-
-        if looks_daty(db_type_str):
-            return True
-        if np_dtype and np_dtype.name and looks_daty(np_dtype.name):
-            return True
-        return False
-
-    @classmethod
-    def is_dimension(cls, dtype, column_name):
-        if cls.is_id(column_name):
-            return False
-        return dtype.name in ("object", "bool")
-
-    @classmethod
-    def is_id(cls, column_name):
-        return column_name.startswith("id") or column_name.endswith("id")
-
-    @classmethod
-    def agg_func(cls, dtype, column_name):
-        # consider checking for key substring too.
-        if cls.is_id(column_name):
-            return "count_distinct"
-        if (
-            hasattr(dtype, "type")
-            and issubclass(dtype.type, np.generic)
-            and is_numeric(dtype)
-        ):
-            return "sum"
-        return None
-
-    @property
-    def columns(self):
-        """Provides metadata about columns for data visualization.
-
-        :return: dict, with the fields name, type, is_date, is_dim and agg.
-        """
-        if self.df.empty:
-            return None
-
-        columns = []
-        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.df.index))
-        sample = self.df
-        if sample_size:
-            sample = self.df.sample(sample_size)
-        for col in self.df.dtypes.keys():
-            db_type_str = self._type_dict.get(col) or self.db_type(self.df.dtypes[col])
-            column = {
-                "name": col,
-                "agg": self.agg_func(self.df.dtypes[col], col),
-                "type": db_type_str,
-                "is_date": self.is_date(self.df.dtypes[col], db_type_str),
-                "is_dim": self.is_dimension(self.df.dtypes[col], col),
-            }
-
-            if not db_type_str or db_type_str.upper() == "OBJECT":
-                v = sample[col].iloc[0] if not sample[col].empty else None
-                if isinstance(v, str):
-                    column["type"] = "STRING"
-                elif isinstance(v, int):
-                    column["type"] = "INT"
-                elif isinstance(v, float):
-                    column["type"] = "FLOAT"
-                elif isinstance(v, (datetime, date)):
-                    column["type"] = "DATETIME"
-                    column["is_date"] = True
-                    column["is_dim"] = False
-                # check if encoded datetime
-                if (
-                    column["type"] == "STRING"
-                    and self.datetime_conversion_rate(sample[col])
-                    > INFER_COL_TYPES_THRESHOLD
-                ):
-                    column.update({"is_date": True, "is_dim": False, "agg": None})
-            # 'agg' is optional attribute
-            if not column["agg"]:
-                column.pop("agg", None)
-            columns.append(column)
-        return columns
+def df_to_dict(dframe: pd.DataFrame) -> Dict:
+    data = dframe.to_dict(orient="records")
+    # TODO: refactor this
+    for d in data:
+        for k, v in list(d.items()):
+            # if an int is too big for Java Script to handle
+            # convert it to a string
+            if isinstance(v, int):
+                if abs(v) > JS_MAX_INTEGER:
+                    d[k] = str(v)
+    return data
 
 Review comment:
  Would it be preferable or simpler to convert all numeric types in the dataframe to strings before dict-ifying it, in order to (I presume) send it to the client?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@superset.apache.org
For additional commands, e-mail: notifications-help@superset.apache.org