You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/02/15 11:26:43 UTC

[GitHub] HyukjinKwon commented on a change in pull request #23795: [SPARK-26887][SQL][PYTHON] Create datetime.date directly instead of creating datetime64[ns] as intermediate data.

HyukjinKwon commented on a change in pull request #23795: [SPARK-26887][SQL][PYTHON] Create datetime.date directly instead of creating datetime64[ns] as intermediate data.
URL: https://github.com/apache/spark/pull/23795#discussion_r257198948
 
 

 ##########
 File path: python/pyspark/sql/types.py
 ##########
 @@ -1681,38 +1681,53 @@ def from_arrow_schema(arrow_schema):
          for field in arrow_schema])
 
 
-def _check_series_convert_date(series, data_type):
-    """
-    Cast the series to datetime.date if it's a date type, otherwise returns the original series.
+def _arrow_column_to_pandas(column, data_type):
+    """ Convert Arrow Column to pandas Series.
+
+    If the given column is a date type column, creates a series of datetime.date directly instead
+    of creating datetime64[ns] as intermediate data.
 
-    :param series: pandas.Series
-    :param data_type: a Spark data type for the series
 +    :param column: pyarrow.lib.Column
+    :param data_type: a Spark data type for the column
     """
-    import pyarrow
+    import pandas as pd
+    import pyarrow as pa
     from distutils.version import LooseVersion
-    # As of Arrow 0.12.0, date_as_objects is True by default, see ARROW-3910
-    if LooseVersion(pyarrow.__version__) < LooseVersion("0.12.0") and type(data_type) == DateType:
-        return series.dt.date
+    # Since Arrow 0.11.0, support date_as_object to return datetime.date instead of np.datetime64.
+    if LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
+        if type(data_type) == DateType:
+            return pd.Series(column.to_pylist(), name=column.name)
+        else:
+            return column.to_pandas()
     else:
-        return series
+        return column.to_pandas(date_as_object=True)
+
 
+def _arrow_table_to_pandas(table, schema):
+    """ Convert Arrow Table to pandas DataFrame.
 
-def _check_dataframe_convert_date(pdf, schema):
-    """ Correct date type value to use datetime.date.
 +    If the given table contains a date type column, use `_arrow_column_to_pandas` for pyarrow<0.11
 +    or use the `date_as_object` option for pyarrow>=0.11 to avoid creating datetime64[ns] as
 +    intermediate data.
 
     Pandas DataFrame created from PyArrow uses datetime64[ns] for date type values, but we should
     use datetime.date to match the behavior with when Arrow optimization is disabled.
 
-    :param pdf: pandas.DataFrame
-    :param schema: a Spark schema of the pandas.DataFrame
+    :param table: pyarrow.lib.Table
+    :param schema: a Spark schema of the pyarrow.lib.Table
     """
-    import pyarrow
+    import pandas as pd
+    import pyarrow as pa
     from distutils.version import LooseVersion
-    # As of Arrow 0.12.0, date_as_objects is True by default, see ARROW-3910
-    if LooseVersion(pyarrow.__version__) < LooseVersion("0.12.0"):
-        for field in schema:
-            pdf[field.name] = _check_series_convert_date(pdf[field.name], field.dataType)
-    return pdf
+    # Since Arrow 0.11.0, support date_as_object to return datetime.date instead of np.datetime64.
+    if LooseVersion(pa.__version__) < LooseVersion("0.11.0"):
 
 Review comment:
   Looks good @ueshin.
   
   @ueshin, @BryanCutler , BTW, which version of PyArrow do you think we should bump up to in Spark 3.0.0? I was thinking about matching it to 0.12.0, or 0.11.0. I think it's overhead that we should test all the pyarrow versions.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org