You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2021/10/07 03:21:57 UTC
[spark] branch master updated: [SPARK-36742][PYTHON] Fix
ps.to_datetime with plurals of keys like years, months, days
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 218da86 [SPARK-36742][PYTHON] Fix ps.to_datetime with plurals of keys like years, months, days
218da86 is described below
commit 218da86b8d682ddce3208e0c57b6df7055449130
Author: dch nguyen <dg...@viettel.com.vn>
AuthorDate: Thu Oct 7 12:21:06 2021 +0900
[SPARK-36742][PYTHON] Fix ps.to_datetime with plurals of keys like years, months, days
### What changes were proposed in this pull request?
Fix `ps.to_datetime` so that it accepts plural column/key names (`years`, `months`, `days`) in addition to the singular forms, matching pandas' unit-mapping behavior.
### Why are the changes needed?
Previously, `ps.to_datetime` raised a `KeyError` for DataFrames whose columns used the plural key names, even though pandas accepts them.
Before this PR:
``` python
# pandas
df_test = pd.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = pd.to_datetime(df_test[['years', 'months', 'days']])
df_test
years months days date
0 2015 2 4 2015-02-04
1 2016 3 5 2016-03-05
# pandas on spark
df_test = ps.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = ps.to_datetime(df_test[['years', 'months', 'days']])
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/u02/spark/python/pyspark/pandas/namespace.py", line 1643, in to_datetime
psdf = arg[["year", "month", "day"]]
File "/u02/spark/python/pyspark/pandas/frame.py", line 11888, in __getitem__
return self.loc[:, list(key)]
File "/u02/spark/python/pyspark/pandas/indexing.py", line 480, in __getitem__
) = self._select_cols(cols_sel)
File "/u02/spark/python/pyspark/pandas/indexing.py", line 325, in _select_cols
return self._select_cols_by_iterable(cols_sel, missing_keys)
File "/u02/spark/python/pyspark/pandas/indexing.py", line 1356, in _select_cols_by_iterable
raise KeyError("['{}'] not in index".format(name_like_string(key)))
KeyError: "['year'] not in index"
```
### Does this PR introduce _any_ user-facing change?
Yes. After this PR:
``` python
df_test = ps.DataFrame({'years': [2015, 2016], 'months': [2, 3], 'days': [4, 5]})
df_test['date'] = ps.to_datetime(df_test[['years', 'months', 'days']])
df_test
years months days date
0 2015 2 4 2015-02-04
1 2016 3 5 2016-03-05
```
### How was this patch tested?
Unit tests
Closes #34182 from dchvn/SPARK-36742.
Authored-by: dch nguyen <dg...@viettel.com.vn>
Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
python/pyspark/pandas/namespace.py | 27 +++++++++++++++++++++++++--
python/pyspark/pandas/tests/test_namespace.py | 21 +++++++++++++++++++++
2 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/python/pyspark/pandas/namespace.py b/python/pyspark/pandas/namespace.py
index 8df5d2c..2d62dea 100644
--- a/python/pyspark/pandas/namespace.py
+++ b/python/pyspark/pandas/namespace.py
@@ -1629,9 +1629,30 @@ def to_datetime(
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
"""
+ # mappings for assembling units
+ # From pandas: pandas.core.tools.datetimes
+ _unit_map = {
+ "year": "year",
+ "years": "year",
+ "month": "month",
+ "months": "month",
+ "day": "day",
+ "days": "day",
+ }
+
+ # replace passed unit with _unit_map
+ def f(value):
+ if value in _unit_map:
+ return _unit_map[value]
+
+ if value.lower() in _unit_map:
+ return _unit_map[value.lower()]
+
+ return value
+
def pandas_to_datetime(pser_or_pdf: Union[pd.DataFrame, pd.Series]) -> Series[np.datetime64]:
if isinstance(pser_or_pdf, pd.DataFrame):
- pser_or_pdf = pser_or_pdf[["year", "month", "day"]]
+ pser_or_pdf = pser_or_pdf[[unit_rev["year"], unit_rev["month"], unit_rev["day"]]]
return pd.to_datetime(
pser_or_pdf,
errors=errors,
@@ -1644,7 +1665,9 @@ def to_datetime(
if isinstance(arg, Series):
return arg.pandas_on_spark.transform_batch(pandas_to_datetime)
if isinstance(arg, DataFrame):
- psdf = arg[["year", "month", "day"]]
+ unit = {k: f(k) for k in arg.keys()}
+ unit_rev = {v: k for k, v in unit.items()}
+ psdf = arg[[unit_rev["year"], unit_rev["month"], unit_rev["day"]]]
return psdf.pandas_on_spark.transform_batch(pandas_to_datetime)
return pd.to_datetime(
arg,
diff --git a/python/pyspark/pandas/tests/test_namespace.py b/python/pyspark/pandas/tests/test_namespace.py
index 29578a9..6d51216 100644
--- a/python/pyspark/pandas/tests/test_namespace.py
+++ b/python/pyspark/pandas/tests/test_namespace.py
@@ -71,6 +71,27 @@ class NamespaceTest(PandasOnSparkTestCase, SQLTestUtils):
ps.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")),
)
+ pdf = pd.DataFrame({"years": [2015, 2016], "month": [2, 3], "day": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
+ pdf = pd.DataFrame({"years": [2015, 2016], "months": [2, 3], "day": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
+ pdf = pd.DataFrame({"years": [2015, 2016], "months": [2, 3], "days": [4, 5]})
+ psdf = ps.from_pandas(pdf)
+ dict_from_pdf = pdf.to_dict()
+
+ self.assert_eq(pd.to_datetime(pdf), ps.to_datetime(psdf))
+ self.assert_eq(pd.to_datetime(dict_from_pdf), ps.to_datetime(dict_from_pdf))
+
def test_date_range(self):
self.assert_eq(
ps.date_range(start="1/1/2018", end="1/08/2018"),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org