You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by be...@apache.org on 2023/08/10 23:15:34 UTC

[superset] branch sc_73447 created (now 4ecf3b9118)

This is an automated email from the ASF dual-hosted git repository.

beto pushed a change to branch sc_73447
in repository https://gitbox.apache.org/repos/asf/superset.git


      at 4ecf3b9118 fix: to_datetime in Pandas 2

This branch includes the following new commits:

     new 4ecf3b9118 fix: to_datetime in Pandas 2

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[superset] 01/01: fix: to_datetime in Pandas 2

Posted by be...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch sc_73447
in repository https://gitbox.apache.org/repos/asf/superset.git

commit 4ecf3b911849d8abd457a39919d64aa91b804a59
Author: Beto Dealmeida <ro...@dealmeida.net>
AuthorDate: Thu Aug 10 16:14:50 2023 -0700

    fix: to_datetime in Pandas 2
---
 superset/utils/core.py              | 11 +++++++++--
 tests/unit_tests/utils/test_core.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/superset/utils/core.py b/superset/utils/core.py
index cd8c62efe7..e0ebf8cb9a 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1834,17 +1834,24 @@ def normalize_dttm_col(
                 # Column is formatted as a numeric value
                 unit = _col.timestamp_format.replace("epoch_", "")
                 df[_col.col_label] = pd.to_datetime(
-                    dttm_series, utc=False, unit=unit, origin="unix", errors="coerce"
+                    dttm_series,
+                    utc=False,
+                    unit=unit,
+                    origin="unix",
+                    errors="raise",
+                    exact=False,
                 )
             else:
                 # Column has already been formatted as a timestamp.
                 df[_col.col_label] = dttm_series.apply(pd.Timestamp)
         else:
+            print(_col.timestamp_format)
             df[_col.col_label] = pd.to_datetime(
                 df[_col.col_label],
                 utc=False,
                 format=_col.timestamp_format,
-                errors="coerce",
+                errors="raise",
+                exact=False,
             )
         if _col.offset:
             df[_col.col_label] += timedelta(hours=_col.offset)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index 568595517c..562ebe582e 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -17,11 +17,14 @@
 import os
 from typing import Any, Optional
 
+import pandas as pd
 import pytest
 
 from superset.utils.core import (
     cast_to_boolean,
+    DateColumn,
     is_test,
+    normalize_dttm_col,
     parse_boolean_string,
     QueryObjectFilterClause,
     remove_extra_adhoc_filters,
@@ -171,3 +174,30 @@ def test_other_values():
     assert cast_to_boolean([]) is False
     assert cast_to_boolean({}) is False
     assert cast_to_boolean(object()) is False
+
+
+def test_normalize_dttm_col() -> None:
+    """
+    Tests for the ``normalize_dttm_col`` function.
+
+    In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
+    2.0.3 and the behavior of ``pd.to_datetime`` changed.
+    """
+    df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})
+    assert (
+        df.to_markdown()
+        == """
+|    | __time                   |
+|---:|:-------------------------|
+|  0 | 2017-07-01T00:00:00.000Z |
+    """.strip()
+    )
+
+    # in 1.5.3 this would return a datetime64[ns] dtype, but in 2.0.3 we had to
+    # add ``exact=False`` since there is a leftover after parsing the format
+    dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)
+
+    # the function modifies the dataframe in place
+    normalize_dttm_col(df, dttm_cols)
+
+    assert df["__time"].astype(str).tolist() == ["2017-07-01"]