You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@superset.apache.org by GitBox <gi...@apache.org> on 2018/08/03 16:53:57 UTC

[GitHub] mistercrunch closed pull request #5487: Match viz dataframe column case to form_data fields for Snowflake, Oracle and Redshift

mistercrunch closed pull request #5487: Match viz dataframe column case to form_data fields for Snowflake, Oracle and Redshift
URL: https://github.com/apache/incubator-superset/pull/5487
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/superset/dataframe.py b/superset/dataframe.py
index 30ba4c776b..2fecad9264 100644
--- a/superset/dataframe.py
+++ b/superset/dataframe.py
@@ -27,23 +27,26 @@
 INFER_COL_TYPES_SAMPLE_SIZE = 100
 
 
-def dedup(l, suffix='__'):
+def dedup(l, suffix='__', case_sensitive=True):
     """De-duplicates a list of string by suffixing a counter
 
     Always returns the same number of entries as provided, and always returns
-    unique values.
+    unique values. Case sensitive comparison by default.
 
-    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar'])))
-    foo,bar,bar__1,bar__2
+    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'])))
+    foo,bar,bar__1,bar__2,Bar
+    >>> print(','.join(dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False)))
+    foo,bar,bar__1,bar__2,Bar__3
     """
     new_l = []
     seen = {}
     for s in l:
-        if s in seen:
-            seen[s] += 1
-            s += suffix + str(seen[s])
+        s_fixed_case = s if case_sensitive else s.lower()
+        if s_fixed_case in seen:
+            seen[s_fixed_case] += 1
+            s += suffix + str(seen[s_fixed_case])
         else:
-            seen[s] = 0
+            seen[s_fixed_case] = 0
         new_l.append(s)
     return new_l
 
@@ -70,7 +73,9 @@ def __init__(self, data, cursor_description, db_engine_spec):
         if cursor_description:
             column_names = [col[0] for col in cursor_description]
 
-        self.column_names = dedup(column_names)
+        case_sensitive = db_engine_spec.consistent_case_sensitivity
+        self.column_names = dedup(column_names,
+                                  case_sensitive=case_sensitive)
 
         data = data or []
         self.df = (
diff --git a/superset/db_engine_specs.py b/superset/db_engine_specs.py
index 4967d30647..c07910af4d 100644
--- a/superset/db_engine_specs.py
+++ b/superset/db_engine_specs.py
@@ -101,6 +101,7 @@ class BaseEngineSpec(object):
     time_secondary_columns = False
     inner_joins = True
     allows_subquery = True
+    consistent_case_sensitivity = True  # do results have same case as qry for col names?
 
     @classmethod
     def get_time_grains(cls):
@@ -318,7 +319,6 @@ def select_star(cls, my_db, table_name, engine, schema=None, limit=100,
 
         if show_cols:
             fields = [sqla.column(c.get('name')) for c in cols]
-        full_table_name = table_name
         quote = engine.dialect.identifier_preparer.quote
         if schema:
             full_table_name = quote(schema) + '.' + quote(table_name)
@@ -366,6 +366,57 @@ def get_configuration_for_impersonation(cls, uri, impersonate_user, username):
     def execute(cursor, query, async=False):
         cursor.execute(query)
 
+    @classmethod
+    def adjust_df_column_names(cls, df, fd):
+        """Based of fields in form_data, return dataframe with new column names
+
+        Usually sqla engines return column names whose case matches that of the
+        original query. For example:
+            SELECT 1 as col1, 2 as COL2, 3 as Col_3
+        will usually result in the following df.columns:
+            ['col1', 'COL2', 'Col_3'].
+        For these engines there is no need to adjust the dataframe column names
+        (default behavior). However, some engines (at least Snowflake, Oracle and
+        Redshift) return column names with different case than in the original query,
+        usually all uppercase. For these the column names need to be adjusted to
+        correspond to the case of the fields specified in the form data for Viz
+        to work properly. This adjustment can be done here.
+        """
+        if cls.consistent_case_sensitivity:
+            return df
+        else:
+            return cls.align_df_col_names_with_form_data(df, fd)
+
+    @staticmethod
+    def align_df_col_names_with_form_data(df, fd):
+        """Helper function to rename columns that have changed case during query.
+
+        Returns a dataframe where column names have been adjusted to correspond with
+        column names in form data (case insensitive). Examples:
+        dataframe: 'col1', form_data: 'col1' -> no change
+        dataframe: 'COL1', form_data: 'col1' -> dataframe column renamed: 'col1'
+        dataframe: 'col1', form_data: 'Col1' -> dataframe column renamed: 'Col1'
+        """
+
+        columns = set()
+        lowercase_mapping = {}
+
+        metrics = utils.get_metric_names(fd.get('metrics', []))
+        groupby = fd.get('groupby', [])
+        other_cols = [utils.DTTM_ALIAS]
+        for col in metrics + groupby + other_cols:
+            columns.add(col)
+            lowercase_mapping[col.lower()] = col
+
+        rename_cols = {}
+        for col in df.columns:
+            if col not in columns:
+                orig_col = lowercase_mapping.get(col.lower())
+                if orig_col:
+                    rename_cols[col] = orig_col
+
+        return df.rename(index=str, columns=rename_cols)
+
 
 class PostgresBaseEngineSpec(BaseEngineSpec):
     """ Abstract class for Postgres 'like' databases """
@@ -414,6 +465,7 @@ def get_table_names(cls, schema, inspector):
 
 class SnowflakeEngineSpec(PostgresBaseEngineSpec):
     engine = 'snowflake'
+    consistent_case_sensitivity = False
     time_grain_functions = {
         None: '{col}',
         'PT1S': "DATE_TRUNC('SECOND', {col})",
@@ -434,6 +486,15 @@ class SnowflakeEngineSpec(PostgresBaseEngineSpec):
         'P1Y': "DATE_TRUNC('YEAR', {col})",
     }
 
+    @classmethod
+    def adjust_database_uri(cls, uri, selected_schema=None):
+        database = uri.database
+        if '/' in uri.database:
+            database = uri.database.split('/')[0]
+        if selected_schema:
+            uri.database = database + '/' + selected_schema
+        return uri
+
 
 class VerticaEngineSpec(PostgresBaseEngineSpec):
     engine = 'vertica'
@@ -441,11 +502,13 @@ class VerticaEngineSpec(PostgresBaseEngineSpec):
 
 class RedshiftEngineSpec(PostgresBaseEngineSpec):
     engine = 'redshift'
+    consistent_case_sensitivity = False
 
 
 class OracleEngineSpec(PostgresBaseEngineSpec):
     engine = 'oracle'
     limit_method = LimitMethod.WRAP_SQL
+    consistent_case_sensitivity = False
 
     time_grain_functions = {
         None: '{col}',
diff --git a/superset/viz.py b/superset/viz.py
index 770d7eaff0..7ba8645bbc 100644
--- a/superset/viz.py
+++ b/superset/viz.py
@@ -153,7 +153,7 @@ def run_extra_queries(self):
 
     def handle_nulls(self, df):
         fillna = self.get_fillna_for_columns(df.columns)
-        df = df.fillna(fillna)
+        return df.fillna(fillna)
 
     def get_fillna_for_col(self, col):
         """Returns the value to use as filler for a specific Column.type"""
@@ -217,7 +217,7 @@ def get_df(self, query_obj=None):
                 self.df_metrics_to_num(df, query_obj.get('metrics') or [])
 
             df.replace([np.inf, -np.inf], np.nan)
-            self.handle_nulls(df)
+            df = self.handle_nulls(df)
         return df
 
     @staticmethod
@@ -382,6 +382,9 @@ def get_df_payload(self, query_obj=None):
         if query_obj and not is_loaded:
             try:
                 df = self.get_df(query_obj)
+                if hasattr(self.datasource.database, 'db_engine_spec'):
+                    db_engine_spec = self.datasource.database.db_engine_spec
+                    df = db_engine_spec.adjust_df_column_names(df, self.form_data)
                 if self.status != utils.QueryStatus.FAILED:
                     stats_logger.incr('loaded_from_source')
                     is_loaded = True
diff --git a/tests/dataframe_test.py b/tests/dataframe_test.py
index fdba431491..a773f08c4e 100644
--- a/tests/dataframe_test.py
+++ b/tests/dataframe_test.py
@@ -16,12 +16,16 @@ def test_dedup(self):
             ['foo', 'bar'],
         )
         self.assertEquals(
-            dedup(['foo', 'bar', 'foo', 'bar']),
-            ['foo', 'bar', 'foo__1', 'bar__1'],
+            dedup(['foo', 'bar', 'foo', 'bar', 'Foo']),
+            ['foo', 'bar', 'foo__1', 'bar__1', 'Foo'],
         )
         self.assertEquals(
-            dedup(['foo', 'bar', 'bar', 'bar']),
-            ['foo', 'bar', 'bar__1', 'bar__2'],
+            dedup(['foo', 'bar', 'bar', 'bar', 'Bar']),
+            ['foo', 'bar', 'bar__1', 'bar__2', 'Bar'],
+        )
+        self.assertEquals(
+            dedup(['foo', 'bar', 'bar', 'bar', 'Bar'], case_sensitive=False),
+            ['foo', 'bar', 'bar__1', 'bar__2', 'Bar__3'],
         )
 
     def test_get_columns_basic(self):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: notifications-unsubscribe@superset.apache.org
For additional commands, e-mail: notifications-help@superset.apache.org