You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by be...@apache.org on 2023/08/11 02:32:25 UTC

[superset] branch master updated: fix: `to_datetime` in Pandas 2 (#24952)

This is an automated email from the ASF dual-hosted git repository.

beto pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git


The following commit(s) were added to refs/heads/master by this push:
     new 41ca4a00b9 fix: `to_datetime` in Pandas 2 (#24952)
41ca4a00b9 is described below

commit 41ca4a00b94437beb80adf278623888490d81a17
Author: Beto Dealmeida <ro...@dealmeida.net>
AuthorDate: Thu Aug 10 19:32:15 2023 -0700

    fix: `to_datetime` in Pandas 2 (#24952)
---
 superset/utils/core.py                 | 10 ++++++++--
 tests/integration_tests/utils_tests.py |  6 +++---
 tests/unit_tests/utils/test_core.py    | 30 ++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/superset/utils/core.py b/superset/utils/core.py
index cd8c62efe7..8b1cc1a485 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1834,7 +1834,12 @@ def normalize_dttm_col(
                 # Column is formatted as a numeric value
                 unit = _col.timestamp_format.replace("epoch_", "")
                 df[_col.col_label] = pd.to_datetime(
-                    dttm_series, utc=False, unit=unit, origin="unix", errors="coerce"
+                    dttm_series,
+                    utc=False,
+                    unit=unit,
+                    origin="unix",
+                    errors="raise",
+                    exact=False,
                 )
             else:
                 # Column has already been formatted as a timestamp.
@@ -1844,7 +1849,8 @@ def normalize_dttm_col(
                 df[_col.col_label],
                 utc=False,
                 format=_col.timestamp_format,
-                errors="coerce",
+                errors="raise",
+                exact=False,
             )
         if _col.offset:
             df[_col.col_label] += timedelta(hours=_col.offset)
diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py
index c0383d1d0b..86d8bf6e68 100644
--- a/tests/integration_tests/utils_tests.py
+++ b/tests/integration_tests/utils_tests.py
@@ -1114,7 +1114,7 @@ class TestUtils(SupersetTestCase):
         df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
         assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
 
-        # test that out of bounds timestamps are coerced to None instead of
-        # erroring out
+        # test that we raise an error when we can't convert
         df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
-        assert pd.isnull(normalize_col(df, None, 0, None)[DTTM_ALIAS][0])
+        with pytest.raises(pd.errors.OutOfBoundsDatetime):
+            normalize_col(df, None, 0, None)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index 568595517c..562ebe582e 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -17,11 +17,14 @@
 import os
 from typing import Any, Optional
 
+import pandas as pd
 import pytest
 
 from superset.utils.core import (
     cast_to_boolean,
+    DateColumn,
     is_test,
+    normalize_dttm_col,
     parse_boolean_string,
     QueryObjectFilterClause,
     remove_extra_adhoc_filters,
@@ -171,3 +174,30 @@ def test_other_values():
     assert cast_to_boolean([]) is False
     assert cast_to_boolean({}) is False
     assert cast_to_boolean(object()) is False
+
+
+def test_normalize_dttm_col() -> None:
+    """
+    Tests for the ``normalize_dttm_col`` function.
+
+    In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
+    2.0.3 and the behavior of ``pd.to_datetime`` changed.
+    """
+    df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})
+    assert (
+        df.to_markdown()
+        == """
+|    | __time                   |
+|---:|:-------------------------|
+|  0 | 2017-07-01T00:00:00.000Z |
+    """.strip()
+    )
+
+    # in 1.5.3 this would return a datetime64[ns] dtype, but in 2.0.3 we had to
+    # add ``exact=False`` since there is a leftover after parsing the format
+    dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)
+
+    # the function modifies the dataframe in place
+    normalize_dttm_col(df, dttm_cols)
+
+    assert df["__time"].astype(str).tolist() == ["2017-07-01"]