You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/07 03:21:57 UTC
[spark] branch master updated: [SPARK-36742][PYTHON] Fix
ps.to_datetime with plurals of keys like years, months, days
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 218da86 [SPARK-36742][PYTHON] Fix ps.to_datetime with plurals of keys like years, months, days
218da86 is described below
commit 218da86b8d682ddce3208e0c57b6df7055449130
Author: dch nguyen <dg...@viettel.com.vn>
AuthorDate: Thu Oct 7 12:21:06 2021 +0900
[SPARK-36742][PYTHON] Fix ps.to_datetime with plurals of keys like years, months, days
### What changes were proposed in this pull request?
Fix `ps.to_datetime` so that it accepts plural column/key names (`years`, `months`, `days`) in addition to the singular forms, matching pandas' unit-mapping behavior.
### Why are the changes needed?
Previously, `ps.to_datetime` raised a `KeyError` for DataFrames whose columns used the plural key names, even though pandas accepts them.
Before this PR:
``` python
# pandas
df_test = pd.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = pd.to_datetime(df_test[['years', 'months', 'days']])
df_test
years months days date
0 2015 2 4 2015-02-04
1 2016 3 5 2016-03-05
# pandas on spark
df_test = ps.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = ps.to_datetime(df_test[['years', 'months', 'days']])
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/u02/spark/python/pyspark/pandas/namespace.py", line 1643, in to_datetime
psdf = arg[["year", "month", "day"]]
File "/u02/spark/python/pyspark/pandas/frame.py", line 11888, in __getitem__
return self.loc[:, list(key)]
File "/u02/spark/python/pyspark/pandas/indexing.py", line 480, in __getitem__
) = self._select_cols(cols_sel)
File "/u02/spark/python/pyspark/pandas/indexing.py", line 325, in _select_cols
return self._select_cols_by_iterable(cols_sel, missing_keys)
File "/u02/spark/python/pyspark/pandas/indexing.py", line 1356, in _select_cols_by_iterable
raise KeyError("['{}'] not in index".format(name_like_string(key)))
KeyError: "['year'] not in index"
```
### Does this PR introduce _any_ user-facing change?
Yes. After this PR:
``` python
df_test = ps.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = ps.to_datetime(df_test[['years', 'months', 'days']])
df_test
years months days date
0 2015 2 4 2015-02-04
1 2016 3 5 2016-03-05
```
### How was this patch tested?
Unit tests
Closes #34182 from dchvn/SPARK-36742.
Authored-by: dch nguyen <dg...@viettel.com.vn>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/namespace.py | 27 +++++++++++++++++++++++++--
python/pyspark/pandas/tests/test_namespace.py | 21 +++++++++++++++++++++
2 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 8df5d2c..2d62dea 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -1629,9 +1629,30 @@ def to_datetime(
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
"""
+ # mappings for assembling units
+ # From pandas: pandas.core.tools.datetimes
+ _unit_map = {
+ "year": "year",
+ "years": "year",
+ "month": "month",
+ "months": "month",
+ "day": "day",
+ "days": "day",
+ }
+
+ # replace passed unit with _unit_map
+ def f(value):
+ if value in _unit_map:
+ return _unit_map[value]
+
+ if value.lower() in _unit_map:
+ return _unit_map[value.lower()]
+
+ return value
+
def pandas_to_datetime(pser_or_pdf: Union[pd.DataFrame, pd.Series]) -> Series[np.datetime64]:
if isinstance(pser_or_pdf, pd.DataFrame):
- pser_or_pdf = pser_or_pdf[["year", "month", "day"]]
+ pser_or_pdf = pser_or_pdf[[unit_rev["year"], unit_rev["month"], unit_rev["day"]]]
return pd.to_datetime(
pser_or_pdf,
errors=errors,
@@ -1644,7 +1665,9 @@ def to_datetime(
if isinstance(arg, Series):
return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
if isinstance(arg, DataFrame):
- psdf = arg[["year", "month", "day"]]
+ unit = {k: f(k) for k in arg.keys()}
+ unit_rev = {v: k for k, v in unit.items()}
+ psdf = arg[[unit_rev["year"], unit_rev["month"], unit_rev["day"]]]
return psdf.pandas_on_spark.transform_batch(pandas_to_datetime)
return pd.to_datetime(
arg,
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index 29578a9..6d51216 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -71,6 +71,27 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
ps.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")),
)
+ pdf = pd.DataFrame({"years": [2015, 2016], "month": [2, 3], "day": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
+ pdf = pd.DataFrame({"years": [2015, 2016], "months": [2, 3], "day": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
+ pdf = pd.DataFrame({"years": [2015, 2016], "months": [2, 3], "days": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
def test_date_range(self):
self.assert_eq(
ps.date_range(start="1/1/2018", end="1/08/2018"),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org