Posted to commits@arrow.apache.org by we...@apache.org on 2019/01/01 19:34:34 UTC

[arrow] branch master updated: ARROW-3910: [Python] Set date_as_object=True as default in to_pandas methods

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9376d85  ARROW-3910: [Python] Set date_as_object=True as default in to_pandas methods
9376d85 is described below

commit 9376d85c409f4b9b272297b3acb6a0f70dcedc32
Author: Wes McKinney <we...@apache.org>
AuthorDate: Tue Jan 1 13:34:25 2019 -0600

    ARROW-3910: [Python] Set date_as_object=True as default in to_pandas methods
    
    This does not add a deprecation warning, primarily because it would be awkward to implement: we would need to check whether the data type is a date (or, in the case of a table, whether any field is a date) and warn only then. `True` is nevertheless the correct default for accurately round-tripping data to and from pandas. Some users may have workarounds floating around, but this is sufficiently advanced usage already.
    
    With this patch, date data round-trips with no special options:
    
    ```
    In [2]: import pyarrow as pa
    
    In [3]: import datetime
    
    In [4]: arr = pa.array([datetime.date(2000, 1, 1), None])
    
    In [5]: arr
    Out[5]:
    <pyarrow.lib.Date32Array object at 0x0000022CCDB1BBD8>
    [
      10957,
      null
    ]
    
    In [6]: arr.to_pandas()
    Out[6]: array([datetime.date(2000, 1, 1), None], dtype=object)
    
    In [7]: pa.array(arr.to_pandas())
    Out[7]:
    <pyarrow.lib.Date32Array object at 0x0000022CCDC7FE58>
    [
      10957,
      null
    ]
    ```
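    
    Users who prefer the previous behavior can still opt out per call by
    passing `date_as_object=False`. A minimal sketch (the exact repr may
    differ; per the updated tests, arrays come back as `datetime64[D]` and
    columns/tables as `datetime64[ns]`):
    
    ```
    In [8]: arr.to_pandas(date_as_object=False)
    Out[8]: array(['2000-01-01', 'NaT'], dtype='datetime64[D]')
    ```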
    
    If others strongly feel it's worth going to the effort of raising a deprecation warning, please chime in.
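    
    For completeness, the table case called out above follows the same new
    default. A hedged sketch, continuing the session above (the column name
    and data are illustrative):
    
    ```
    In [9]: import pandas as pd
    
    In [10]: df = pd.DataFrame({'date': [datetime.date(2000, 1, 1), None]})
    
    In [11]: table = pa.Table.from_pandas(df, preserve_index=False)
    
    In [12]: table.to_pandas()['date'].dtype
    Out[12]: dtype('O')
    
    In [13]: table.to_pandas(date_as_object=False)['date'].dtype
    Out[13]: dtype('<M8[ns]')
    ```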
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3272 from wesm/ARROW-3910 and squashes the following commits:
    
    308afe56 <Wes McKinney> Add Windows makefile for Sphinx, add section about date conversions to pandas.rst
    f77c2967 <Wes McKinney> Set date_as_object=True as default in to_pandas methods
---
 docs/make.bat                               | 52 ++++++++++++++++
 docs/source/building.rst                    | 71 ++++++++++++++++++++++
 docs/source/index.rst                       |  6 ++
 docs/source/python/development.rst          | 50 ---------------
 docs/source/python/pandas.rst               | 68 ++++++++++++++++++++-
 python/pyarrow/array.pxi                    |  6 +-
 python/pyarrow/tests/test_convert_pandas.py | 94 ++++++++++-------------------
 7 files changed, 231 insertions(+), 116 deletions(-)

diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..36f2086
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,52 @@
+@rem Licensed to the Apache Software Foundation (ASF) under one
+@rem or more contributor license agreements.  See the NOTICE file
+@rem distributed with this work for additional information
+@rem regarding copyright ownership.  The ASF licenses this file
+@rem to you under the Apache License, Version 2.0 (the
+@rem "License"); you may not use this file except in compliance
+@rem with the License.  You may obtain a copy of the License at
+@rem
+@rem   http://www.apache.org/licenses/LICENSE-2.0
+@rem
+@rem Unless required by applicable law or agreed to in writing,
+@rem software distributed under the License is distributed on an
+@rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+@rem KIND, either express or implied.  See the License for the
+@rem specific language governing permissions and limitations
+@rem under the License.
+
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=_build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/docs/source/building.rst b/docs/source/building.rst
new file mode 100644
index 0000000..0fb4486
--- /dev/null
+++ b/docs/source/building.rst
@@ -0,0 +1,71 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Building the Documentation
+==========================
+
+Prerequisites
+-------------
+
+The documentation build process uses `Doxygen <http://www.doxygen.nl/>`_ and
+`Sphinx <http://www.sphinx-doc.org/>`_ along with a few extensions.
+
+If you're using Conda, the required software can be installed in a single line:
+
+.. code-block:: shell
+
+   conda install -c conda-forge --file ci/conda_env_sphinx.yml
+
+Otherwise, you'll first need to install `Doxygen <http://www.doxygen.nl/>`_
+yourself (for example from your distribution's official repositories, if
+using Linux).  Then you can install the Python-based requirements with the
+following command:
+
+.. code-block:: shell
+
+   pip install -r docs/requirements.txt
+
+Building
+--------
+
+.. note::
+
+   If you are building the documentation on Windows, not all sections
+   may build properly.
+
+These two steps are mandatory and must be executed in order.
+
+#. Process the C++ API using Doxygen
+
+   .. code-block:: shell
+
+      pushd cpp/apidoc
+      doxygen
+      popd
+
+#. Build the complete documentation using Sphinx
+
+   .. code-block:: shell
+
+      pushd docs
+      make html
+      popd
+
+After these steps are completed, the documentation is rendered in HTML
+format in ``docs/_build/html``.  In particular, you can point your browser
+at ``docs/_build/html/index.html`` to read the docs and review any changes
+you made.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index fa6c683..2b367b3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -40,3 +40,9 @@ messaging and interprocess communication.
 
    cpp/index
    python/index
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Other Topics
+
+   building
diff --git a/docs/source/python/development.rst b/docs/source/python/development.rst
index 63e6051..ba8cfef 100644
--- a/docs/source/python/development.rst
+++ b/docs/source/python/development.rst
@@ -364,53 +364,3 @@ Getting ``python-test.exe`` to run is a bit tricky because your
    set PYTHONHOME=%CONDA_PREFIX%
 
 Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work.
-
-Building the Documentation
-==========================
-
-Prerequisites
--------------
-
-The documentation build process uses `Doxygen <http://www.doxygen.nl/>`_ and
-`Sphinx <http://www.sphinx-doc.org/>`_ along with a few extensions.
-
-If you're using Conda, the required software can be installed in a single line:
-
-.. code-block:: shell
-
-   conda install -c conda-forge --file ci/conda_env_sphinx.yml
-
-Otherwise, you'll first need to install `Doxygen <http://www.doxygen.nl/>`_
-yourself (for example from your distribution's official repositories, if
-using Linux).  Then you can install the Python-based requirements with the
-following command:
-
-.. code-block:: shell
-
-   pip install -r docs/requirements.txt
-
-Building
---------
-
-These two steps are mandatory and must be executed in order.
-
-#. Process the C++ API using Doxygen
-
-   .. code-block:: shell
-
-      pushd cpp/apidoc
-      doxygen
-      popd
-
-#. Build the complete documentation using Sphinx
-
-   .. code-block:: shell
-
-      pushd docs
-      make html
-      popd
-
-After these steps are completed, the documentation is rendered in HTML
-format in ``docs/_build/html``.  In particular, you can point your browser
-at ``docs/_build/html/index.html`` to read the docs and review any changes
-you made.
diff --git a/docs/source/python/pandas.rst b/docs/source/python/pandas.rst
index 16b4ff6..dbc5e77 100644
--- a/docs/source/python/pandas.rst
+++ b/docs/source/python/pandas.rst
@@ -29,6 +29,13 @@ to them.
    (such as a different type system, and support for null values) that this
    is a separate topic from :ref:`numpy_interop`.
 
+To follow examples in this document, make sure to run:
+
+.. ipython:: python
+
+   import pandas as pd
+   import pyarrow as pa
+
 DataFrames
 ----------
 
@@ -120,5 +127,64 @@ Arrow -> pandas Conversion
 +-------------------------------------+--------------------------------------------------------+
 | ``TIMESTAMP(unit=*)``               | ``pd.Timestamp`` (``np.datetime64[ns]``)               |
 +-------------------------------------+--------------------------------------------------------+
-| ``DATE``                            | ``pd.Timestamp`` (``np.datetime64[ns]``)               |
+| ``DATE``                            | ``object`` (with ``datetime.date`` objects)            |
 +-------------------------------------+--------------------------------------------------------+
+
+Categorical types
+~~~~~~~~~~~~~~~~~
+
+TODO
+
+Datetime (Timestamp) types
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+TODO
+
+Date types
+~~~~~~~~~~
+
+While dates can be handled using the ``datetime64[ns]`` type in
+pandas, some systems work with object arrays of Python's built-in
+``datetime.date`` object:
+
+.. ipython:: python
+
+   from datetime import date
+   s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)])
+   s
+
+When converting to an Arrow array, the ``date32`` type will be used by
+default:
+
+.. ipython:: python
+
+   arr = pa.array(s)
+   arr.type
+   arr[0]
+
+To use the 64-bit ``date64``, specify this explicitly:
+
+.. ipython:: python
+
+   arr = pa.array(s, type='date64')
+   arr.type
+
+When converting back with ``to_pandas``, object arrays of
+``datetime.date`` objects are returned:
+
+.. ipython:: python
+
+   arr.to_pandas()
+
+If you want to use NumPy's ``datetime64`` dtype instead, pass
+``date_as_object=False``:
+
+.. ipython:: python
+
+   s2 = pd.Series(arr.to_pandas(date_as_object=False))
+   s2.dtype
+
+Time types
+~~~~~~~~~~
+
+TODO
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ef95efe..54d0e92 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -343,10 +343,8 @@ cdef class _PandasConvertible:
 
     def to_pandas(self, categories=None, bint strings_to_categorical=False,
                   bint zero_copy_only=False, bint integer_object_nulls=False,
-                  bint date_as_object=False,
-                  bint use_threads=True,
-                  bint deduplicate_objects=True,
-                  bint ignore_metadata=False):
+                  bint date_as_object=True, bint use_threads=True,
+                  bint deduplicate_objects=True, bint ignore_metadata=False):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 8d8b65b..3e89f5e 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -912,7 +912,7 @@ class TestConvertDateTimeLikeTypes(object):
 
         result = table.to_pandas()
         expected_df = pd.DataFrame(
-            {"date": np.array(["2000-01-01"], dtype="datetime64[ns]")}
+            {"date": np.array([date(2000, 1, 1)], dtype=object)}
         )
         tm.assert_frame_equal(expected_df, result)
 
@@ -962,7 +962,7 @@ class TestConvertDateTimeLikeTypes(object):
         with pytest.raises(pa.ArrowInvalid, match=expected_msg):
             pa.Array.from_pandas(s, type=pa.date64(), mask=mask)
 
-    def test_array_date_as_object(self):
+    def test_array_types_date_as_object(self):
         data = [date(2000, 1, 1),
                 None,
                 date(1970, 1, 1),
@@ -972,58 +972,23 @@ class TestConvertDateTimeLikeTypes(object):
                              '1970-01-01',
                              '2040-02-26'], dtype='datetime64')
 
-        arr = pa.array(data)
-        assert arr.equals(pa.array(expected))
-
-        result = arr.to_pandas()
-        assert result.dtype == expected.dtype
-        npt.assert_array_equal(arr.to_pandas(), expected)
-
-        result = arr.to_pandas(date_as_object=True)
-        expected = expected.astype(object)
-        assert result.dtype == expected.dtype
-        npt.assert_array_equal(result, expected)
-
-    def test_chunked_array_convert_date_as_object(self):
-        data = [date(2000, 1, 1),
-                None,
-                date(1970, 1, 1),
-                date(2040, 2, 26)]
-        expected = np.array(['2000-01-01',
-                             None,
-                             '1970-01-01',
-                             '2040-02-26'], dtype='datetime64')
-        carr = pa.chunked_array([data])
-
-        result = carr.to_pandas()
-        assert result.dtype == expected.dtype
-        npt.assert_array_equal(carr.to_pandas(), expected)
-
-        result = carr.to_pandas(date_as_object=True)
-        expected = expected.astype(object)
-        assert result.dtype == expected.dtype
-        npt.assert_array_equal(result, expected)
+        objects = [
+            # The second value is the expected value for date_as_object=False
+            (pa.array(data), expected),
+            (pa.chunked_array([data]), expected),
+            (pa.column('date', [data]), expected.astype('M8[ns]'))]
 
-    def test_column_convert_date_as_object(self):
-        data = [date(2000, 1, 1),
-                None,
-                date(1970, 1, 1),
-                date(2040, 2, 26)]
-        expected = np.array(['2000-01-01',
-                             None,
-                             '1970-01-01',
-                             '2040-02-26'], dtype='datetime64')
-
-        arr = pa.array(data)
-        column = pa.column('date', arr)
+        assert objects[0][0].equals(pa.array(expected))
 
-        result = column.to_pandas()
-        npt.assert_array_equal(column.to_pandas(), expected)
+        for obj, expected_datetime64 in objects:
+            result = obj.to_pandas()
+            expected_obj = expected.astype(object)
+            assert result.dtype == expected_obj.dtype
+            npt.assert_array_equal(result, expected_obj)
 
-        result = column.to_pandas(date_as_object=True)
-        expected = expected.astype(object)
-        assert result.dtype == expected.dtype
-        npt.assert_array_equal(result, expected)
+            result = obj.to_pandas(date_as_object=False)
+            assert result.dtype == expected_datetime64.dtype
+            npt.assert_array_equal(result, expected_datetime64)
 
     def test_table_convert_date_as_object(self):
         df = pd.DataFrame({
@@ -1034,8 +999,8 @@ class TestConvertDateTimeLikeTypes(object):
 
         table = pa.Table.from_pandas(df, preserve_index=False)
 
-        df_datetime = table.to_pandas()
-        df_object = table.to_pandas(date_as_object=True)
+        df_datetime = table.to_pandas(date_as_object=False)
+        df_object = table.to_pandas()
 
         tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
                               check_dtype=True)
@@ -1055,9 +1020,7 @@ class TestConvertDateTimeLikeTypes(object):
         assert table.schema.equals(expected_schema)
 
         result = table.to_pandas()
-        expected = df.copy()
-        expected['date'] = pd.to_datetime(df['date'])
-        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, df)
 
     def test_date_mask(self):
         arr = np.array([date(2017, 4, 3), date(2017, 4, 4)],
@@ -1094,18 +1057,27 @@ class TestConvertDateTimeLikeTypes(object):
         # Test converting back to pandas
         colnames = ['date32', 'date64']
         table = pa.Table.from_arrays([a32, a64], colnames)
-        table_pandas = table.to_pandas()
 
         ex_values = (np.array(['2017-04-03', '2017-04-04', '2017-04-04',
                                '2017-04-05'],
-                              dtype='datetime64[D]')
-                     .astype('datetime64[ns]'))
+                              dtype='datetime64[D]'))
         ex_values[1] = pd.NaT.value
-        expected_pandas = pd.DataFrame({'date32': ex_values,
-                                        'date64': ex_values},
+
+        ex_datetime64ns = ex_values.astype('datetime64[ns]')
+        expected_pandas = pd.DataFrame({'date32': ex_datetime64ns,
+                                        'date64': ex_datetime64ns},
                                        columns=colnames)
+        table_pandas = table.to_pandas(date_as_object=False)
         tm.assert_frame_equal(table_pandas, expected_pandas)
 
+        table_pandas_objects = table.to_pandas()
+        ex_objects = ex_values.astype('object')
+        expected_pandas_objects = pd.DataFrame({'date32': ex_objects,
+                                                'date64': ex_objects},
+                                               columns=colnames)
+        tm.assert_frame_equal(table_pandas_objects,
+                              expected_pandas_objects)
+
     def test_dates_from_integers(self):
         t1 = pa.date32()
         t2 = pa.date64()