You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by be...@apache.org on 2023/08/10 23:15:35 UTC

[superset] 01/01: fix: to_datetime in Pandas 2

This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch sc_73447
in repository https://gitbox.apache.org/repos/asf/superset.git

commit 4ecf3b911849d8abd457a39919d64aa91b804a59
Author: Beto Dealmeida <ro...@dealmeida.net>
AuthorDate: Thu Aug 10 16:14:50 2023 -0700

    fix: to_datetime in Pandas 2
---
 superset/utils/core.py              | 11 +++++++++--
 tests/unit_tests/utils/test_core.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/superset/utils/core.py b/superset/utils/core.py
index cd8c62efe7..e0ebf8cb9a 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1834,17 +1834,24 @@ def normalize_dttm_col(
                 # Column is formatted as a numeric value
                 unit = _col.timestamp_format.replace("epoch_", "")
                 df[_col.col_label] = pd.to_datetime(
-                    dttm_series, utc=False, unit=unit, origin="unix", errors="coerce"
+                    dttm_series,
+                    utc=False,
+                    unit=unit,
+                    origin="unix",
+                    errors="raise",
+                    exact=False,
                 )
             else:
                 # Column has already been formatted as a timestamp.
                 df[_col.col_label] = dttm_series.apply(pd.Timestamp)
         else:
+            print(_col.timestamp_format)
             df[_col.col_label] = pd.to_datetime(
                 df[_col.col_label],
                 utc=False,
                 format=_col.timestamp_format,
-                errors="coerce",
+                errors="raise",
+                exact=False,
             )
         if _col.offset:
             df[_col.col_label] += timedelta(hours=_col.offset)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index 568595517c..562ebe582e 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -17,11 +17,14 @@
 import os
 from typing import Any, Optional
 
+import pandas as pd
 import pytest
 
 from superset.utils.core import (
     cast_to_boolean,
+    DateColumn,
     is_test,
+    normalize_dttm_col,
     parse_boolean_string,
     QueryObjectFilterClause,
     remove_extra_adhoc_filters,
@@ -171,3 +174,30 @@ def test_other_values():
     assert cast_to_boolean([]) is False
     assert cast_to_boolean({}) is False
     assert cast_to_boolean(object()) is False
+
+
+def test_normalize_dttm_col() -> None:
+    """
+    Tests for the ``normalize_dttm_col`` function.
+
+    In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
+    2.0.3 and the behavior of ``pd.to_datetime`` changed.
+    """
+    df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})
+    assert (
+        df.to_markdown()
+        == """
+|    | __time                   |
+|---:|:-------------------------|
+|  0 | 2017-07-01T00:00:00.000Z |
+    """.strip()
+    )
+
+    # in 1.5.3 this returned a datetime64[ns] dtype, but in 2.0.3 we had to pass
+    # ``exact=False`` since the value has trailing text not covered by the format
+    dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)
+
+    # the function modifies the dataframe in place
+    normalize_dttm_col(df, dttm_cols)
+
+    assert df["__time"].astype(str).tolist() == ["2017-07-01"]