You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@superset.apache.org by be...@apache.org on 2023/08/11 02:32:25 UTC
[superset] branch master updated: fix: `to_datetime` in Pandas 2 (#24952)
This is an automated email from the ASF dual-hosted git repository.
beto pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/superset.git
The following commit(s) were added to refs/heads/master by this push:
new 41ca4a00b9 fix: `to_datetime` in Pandas 2 (#24952)
41ca4a00b9 is described below
commit 41ca4a00b94437beb80adf278623888490d81a17
Author: Beto Dealmeida <ro...@dealmeida.net>
AuthorDate: Thu Aug 10 19:32:15 2023 -0700
fix: `to_datetime` in Pandas 2 (#24952)
---
superset/utils/core.py | 10 ++++++++--
tests/integration_tests/utils_tests.py | 6 +++---
tests/unit_tests/utils/test_core.py | 30 ++++++++++++++++++++++++++++++
3 files changed, 41 insertions(+), 5 deletions(-)
diff --git a/superset/utils/core.py b/superset/utils/core.py
index cd8c62efe7..8b1cc1a485 100644
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -1834,7 +1834,12 @@ def normalize_dttm_col(
# Column is formatted as a numeric value
unit = _col.timestamp_format.replace("epoch_", "")
df[_col.col_label] = pd.to_datetime(
- dttm_series, utc=False, unit=unit, origin="unix", errors="coerce"
+ dttm_series,
+ utc=False,
+ unit=unit,
+ origin="unix",
+ errors="raise",
+ exact=False,
)
else:
# Column has already been formatted as a timestamp.
@@ -1844,7 +1849,8 @@ def normalize_dttm_col(
df[_col.col_label],
utc=False,
format=_col.timestamp_format,
- errors="coerce",
+ errors="raise",
+ exact=False,
)
if _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py
index c0383d1d0b..86d8bf6e68 100644
--- a/tests/integration_tests/utils_tests.py
+++ b/tests/integration_tests/utils_tests.py
@@ -1114,7 +1114,7 @@ class TestUtils(SupersetTestCase):
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
- # test that out of bounds timestamps are coerced to None instead of
- # erroring out
+ # test that out of bounds timestamps raise an error instead of being coerced
df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
- assert pd.isnull(normalize_col(df, None, 0, None)[DTTM_ALIAS][0])
+ with pytest.raises(pd.errors.OutOfBoundsDatetime):
+ normalize_col(df, None, 0, None)
diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py
index 568595517c..562ebe582e 100644
--- a/tests/unit_tests/utils/test_core.py
+++ b/tests/unit_tests/utils/test_core.py
@@ -17,11 +17,14 @@
import os
from typing import Any, Optional
+import pandas as pd
import pytest
from superset.utils.core import (
cast_to_boolean,
+ DateColumn,
is_test,
+ normalize_dttm_col,
parse_boolean_string,
QueryObjectFilterClause,
remove_extra_adhoc_filters,
@@ -171,3 +174,30 @@ def test_other_values():
assert cast_to_boolean([]) is False
assert cast_to_boolean({}) is False
assert cast_to_boolean(object()) is False
+
+
+def test_normalize_dttm_col() -> None:
+ """
+ Tests for the ``normalize_dttm_col`` function.
+
+ In particular, this covers a regression when Pandas was upgraded from 1.5.3 to
+ 2.0.3 and the behavior of ``pd.to_datetime`` changed.
+ """
+ df = pd.DataFrame({"__time": ["2017-07-01T00:00:00.000Z"]})
+ assert (
+ df.to_markdown()
+ == """
+| | __time |
+|---:|:-------------------------|
+| 0 | 2017-07-01T00:00:00.000Z |
+ """.strip()
+ )
+
+ # in 1.5.3 this would return a datetime64[ns] dtype, but in 2.0.3 we had to
+ # add ``exact=False`` since the value has trailing text the format doesn't cover
+ dttm_cols = (DateColumn("__time", "%Y-%m-%d"),)
+
+ # the function modifies the dataframe in place
+ normalize_dttm_col(df, dttm_cols)
+
+ assert df["__time"].astype(str).tolist() == ["2017-07-01"]