Posted to notifications@superset.apache.org by GitBox <gi...@apache.org> on 2020/04/02 19:19:08 UTC

[GitHub] [incubator-superset] ktmud commented on a change in pull request #9427: feat: Add post processing to QueryObject

ktmud commented on a change in pull request #9427: feat: Add post processing to QueryObject
URL: https://github.com/apache/incubator-superset/pull/9427#discussion_r402554088
 
 

 ##########
 File path: superset/utils/pandas_postprocessing.py
 ##########
 @@ -0,0 +1,210 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from functools import partial
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+from flask_babel import gettext as _
+from pandas import DataFrame, NamedAgg
+
+from superset.exceptions import NullValueException, SupersetException
+
+SUPPORTED_NUMPY_FUNCTIONS = (
+    "average",
+    "argmin",
+    "argmax",
+    "cumsum",
+    "cumprod",
+    "max",
+    "mean",
+    "median",
+    "nansum" "nanmin" "nanmax" "nanmean",
+    "nanmedian",
+    "min",
+    "percentile",
+    "prod",
+    "product",
+    "std",
+    "sum",
+    "var",
+)
+
+
+def _get_aggregate_funcs(
+    aggregates: Dict[str, Dict[str, Any]],
+) -> Dict[str, NamedAgg]:
+    """
+    Converts a set of aggregate config objects into functions that pandas can use as
+    aggregators. Currently only numpy aggregators are supported.
+
+    :param aggregates: Mapping from column name to aggregate config.
+    :return: Mapping from metric name to function that takes a single input argument.
+    """
+    agg_funcs: Dict[str, NamedAgg] = {}
+    for name, agg_obj in aggregates.items():
+        column = agg_obj.get("column", name)
+        operator = agg_obj.get("operator") or "sum"
+        if operator not in SUPPORTED_NUMPY_FUNCTIONS:
+            raise SupersetException("Unsupported numpy function: %")
+        func = getattr(np, operator)
+        options = agg_obj.get("options", {})
+        agg_funcs[name] = NamedAgg(column=column, aggfunc=partial(func, **options))
+
+    return agg_funcs
+
+
+def pivot(
+    df: DataFrame,
+    index: List[str],
+    columns: List[str],
+    aggregates: Dict[str, Dict[str, Any]],
+    metric_fill_value: Optional[Any] = None,
+    column_fill_value: Optional[str] = None,
+    drop_missing_columns: Optional[bool] = True,
+    combine_value_with_metric: bool = False,
+    marginal_distributions: Optional[bool] = None,
+    marginal_distribution_name: Optional[str] = None,
+) -> DataFrame:
+    """
+    Perform a pivot operation on a DataFrame.
+
+    :param df: Object on which pivot operation will be performed
+    :param index: Columns to group by on the table index (=rows)
+    :param columns: Columns to group by on the table columns
+    :param metric_fill_value: Value to replace missing values with
+    :param column_fill_value: Value to replace missing pivot columns with
+    :param drop_missing_columns: Do not include columns whose entries are all missing
+    :param combine_value_with_metric: Display metrics side by side within each column,
+           as opposed to each column being displayed side by side for each metric.
+    :param aggregates: A mapping from aggregate column name to the aggregate
+           config.
+    :param marginal_distributions: Add totals for row/column. Defaults to False.
+    :param marginal_distribution_name: Name of row/column with marginal distribution.
+           Defaults to 'All'.
+    :return: A pivot table
+    """
+    if not index:
+        raise SupersetException(_("Pivot operation requires at least one index"))
+    if not columns:
+        raise SupersetException(_("Pivot operation requires at least one column"))
+    if not aggregates:
+        raise SupersetException(_("Pivot operation specifying aggregates"))
+
+    if column_fill_value:
+        df[columns] = df[columns].fillna(value=column_fill_value)
+
+    aggregate_funcs = _get_aggregate_funcs(aggregates)
+
+    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
+    #  Remove once support is added.
+    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}
+
+    df = df.pivot_table(
+        values=aggfunc.keys(),
+        index=index,
+        columns=columns,
+        aggfunc=aggfunc,
+        fill_value=metric_fill_value,
+        dropna=drop_missing_columns,
+        margins=marginal_distributions,
+        margins_name=marginal_distribution_name,
+    )
+
+    if combine_value_with_metric:
+        df = df.stack(0).unstack()
+
+    return df
+
+
+def aggregate(
+    df: DataFrame, groupby: List[str], aggregates: Dict[str, Dict[str, Any]]
+) -> DataFrame:
+    """
+    Apply aggregations to a DataFrame.
+
+    :param df: Object to aggregate.
+    :param groupby: Columns to group by.
+    :param aggregates: A mapping from metric column to the aggregate config used to
+           compute its values.
+    :return: Aggregated DataFrame
+    """
+    aggregates = aggregates or {}
+    aggregate_funcs = _get_aggregate_funcs(aggregates)
+    return df.groupby(by=groupby).agg(**aggregate_funcs)
+
+
+def sort(
+    df: DataFrame,
+    by: List[str],
+    ascending: Optional[Union[bool, Dict[str, bool]]] = True,
+) -> DataFrame:
+    """
+    Sort a DataFrame.
+
+    :param df: DataFrame to sort.
+    :param by: Columns by which to sort `df`.
+    :param ascending: Sort order. Defaults to True. If bool, the same sort order is
+           applied to all columns. If dict, the sort order is applied per column,
+           defaulting to True for columns missing from the dict.
+    :return: Sorted DataFrame
+    """
+    if isinstance(ascending, dict):
+        ascending = [ascending.get(col, True) for col in by]
+
+    return df.sort_values(by=by, ascending=ascending)
+
+
+def rolling(
+    df: DataFrame,
+    rolling_type: str,
+    center: bool = False,
+    win_type: Optional[str] = None,
+    window: Optional[int] = None,
+    min_periods: Optional[int] = None,
+) -> DataFrame:
+    """
+    Apply a rolling window on the dataset. See the Pandas docs for further details:
+    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html
+
+    :param df: DataFrame on which the rolling period will be based.
+    :param rolling_type: Type of rolling window aggregation, e.g. "mean", "sum" or
+           "std". Any method exposed by the pandas Rolling object will work, plus the
+           special value "cumsum".
+    :param center: Should the label be at the center of the window.
+    :param win_type: Type of window function.
+    :param window: Size of the window.
+    :param min_periods: Minimum number of observations in the window required to have
+           a value; rows with fewer observations will be NaN.
+    :return: DataFrame with the rolling window applied
+    """
+    if rolling_type == "cumsum":
+        df = df.cumsum()
+    else:
+        kwargs = {}
+        if window is not None:
+            kwargs["window"] = window
+        if min_periods is not None:
+            kwargs["min_periods"] = min_periods
+        if center is not None:
+            kwargs["center"] = center
+        if win_type is not None:
+            kwargs["win_type"] = win_type
+
+        df_rolling = df.rolling(**kwargs)
+        if not hasattr(df_rolling, rolling_type):
+            raise SupersetException(
+                _("Unsupported rolling_type: %(type)s", type=rolling_type)
+            )
+        df = getattr(df_rolling, rolling_type)()
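
A minimal usage sketch of the helpers added in this diff; the sample DataFrame, column
names and config values below are invented for illustration only, and only the function
signatures come from the code above:

```python
# Hypothetical usage of the post-processing helpers introduced in this file.
# The data and column names are made up for illustration.
import pandas as pd

from superset.utils.pandas_postprocessing import aggregate, pivot, rolling

df = pd.DataFrame(
    {
        "country": ["FI", "FI", "SE", "SE"],
        "gender": ["girl", "boy", "girl", "boy"],
        "births": [50000, 51000, 55000, 56000],
    }
)

# Pivot: one row per country, one column per gender, summing births.
pivoted = pivot(
    df,
    index=["country"],
    columns=["gender"],
    aggregates={"births": {"operator": "sum"}},
)

# Aggregate: total births per country.
aggregated = aggregate(
    df, groupby=["country"], aggregates={"births": {"operator": "sum"}}
)

# Rolling: two-row moving average over the numeric column.
smoothed = rolling(df[["births"]], rolling_type="mean", window=2, min_periods=1)
```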
 
 Review comment:
   It would be more useful if
   
   1. Rolling results are added as a new column to the original df
   2. Users can select which columns to roll
   3. There is the ability to compute rolling diffs
   
   How about an API like this:
   
   ```python
   [
       {
           "operation": "rolling",
           "options": {
               "columns": ["abc", "cde"],
               # 7 day moving average
               "output_columns": ["abc MA7", "cde MA7"],
               "rolling_type": "mean",
               "win_type": None,
               "window": 7,
               "min_periods": 7,
           },
       },
       {
           "operation": "rolling",
           "options": {
               "columns": ["abc", "cde"],
               # MoM growth computed with 30 MA
               "output_columns": ["abc MoM", "cde MoM"],
               "rolling_type": "mean",
               "win_type": None,
               "window": 30,
               "min_periods": 30,
            # apply `df.diff(...)` if `diff_periods` is specified
               "diff_periods": 30,
           },
       },
       {
           "operation": "rolling",
           "options": {
               "columns": ["abc", "cde"],
               # YoY growth in percentages
               "output_columns": ["abc YoY (%)", "cde YoY (%)"],
               "rolling_type": "mean",
               "win_type": None,
               "window": 365,
               "min_periods": 365,
               # apply `df.pct_change(...)`
               "pct_change_periods": 365,
           },
       },
       # `diff_periods` and `pct_change_periods` are shortcuts
       # to applying additional `diff` and `pct_change` operators
       # on the output columns.
       {
           "operation": "diff",
           "options": {
               "columns": ["abc", "cde"],
               "output_columns": ["abc MoM", "cde MoM"],
               "periods": 7,
           },
       },
   ]
   ```
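   
   A rough sketch of how these options could map onto plain pandas calls. The
   `apply_rolling` helper and the exact option keys are just assumptions taken from the
   example above, not an agreed API:
   
    ```python
    # Hypothetical interpreter for the proposed `rolling` operation options.
    from typing import Any, Dict, Optional

    import pandas as pd


    def apply_rolling(df: pd.DataFrame, options: Dict[str, Any]) -> pd.DataFrame:
        columns = options["columns"]
        output_columns = options.get("output_columns", columns)

        # Roll only the requested columns (point 2 above).
        rolled = df[columns].rolling(
            window=options["window"],
            min_periods=options.get("min_periods"),
            win_type=options.get("win_type"),
        )
        result = getattr(rolled, options["rolling_type"])()

        # Shortcut options: apply diff/pct_change on top of the rolled values (point 3).
        diff_periods: Optional[int] = options.get("diff_periods")
        if diff_periods is not None:
            result = result.diff(periods=diff_periods)
        pct_change_periods: Optional[int] = options.get("pct_change_periods")
        if pct_change_periods is not None:
            result = result.pct_change(periods=pct_change_periods)

        # Attach the results as new columns on the original frame (point 1 above).
        for src, dest in zip(columns, output_columns):
            df[dest] = result[src]
        return df
    ```
   
   With an interpreter along these lines, the three `rolling` entries above would each
   add new `* MA7` / `* MoM` / `* YoY (%)` columns while keeping the original columns
   in place.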
   
   
