You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/10 15:38:08 UTC
[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #7169: ARROW-5359: [Python] Support non-nanosecond out-of-range timestamps in conversion to pandas

jorisvandenbossche commented on a change in pull request #7169:
URL: https://github.com/apache/arrow/pull/7169#discussion_r438202601



##########
File path: python/pyarrow/pandas_compat.py
##########
@@ -699,6 +699,22 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
 
     block_arr = item.get('block', None)
     placement = item['placement']
+
+    if (
+            (block_arr is not None) and
+            (block_arr.dtype.type == np.datetime64) and
+            (block_arr.dtype.name != "datetime64[ns]")
+    ):
+        # 1. Non-nanosecond timestamps can express dates outside
+        #    the range supported by nanoseconds.
+        # 2. If the dtype is datetime64 of any sort, deep inside
+        #    Panda's make_block() code path is will do

Review comment:
       ```suggestion
           #    pandas' make_block() code path it will do
       ```

##########
File path: python/pyarrow/pandas_compat.py
##########
@@ -699,6 +699,17 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
 
     block_arr = item.get('block', None)
     placement = item['placement']
+
+    if (
+            (block_arr is not None) and
+            (block_arr.dtype.type == np.datetime64) and
+            (block_arr.dtype.name != "datetime64[ns]")
+    ):
+        # Non-nanosecond timestamps can express much larger values than
+        # nanosecond timestamps, and pandas checks that the values fit into
+        # nanosecond range, so this needs to be an object as dtype.
+        block_arr = block_arr.astype(np.dtype("O"))

Review comment:
       Thanks for the additional comments. All clear now!

##########
File path: python/pyarrow/tests/test_pandas.py
##########
@@ -3941,3 +3946,63 @@ def test_metadata_compat_missing_field_name():
     result = table.to_pandas()
     # on python 3.5 the column order can differ -> adding check_like=True
     tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def make_df_with_timestamps():
+    # Some of the milliseconds timestamps deliberately don't fit in the range
+    # that is possible with nanosecond timestamps.
+    df = pd.DataFrame({
+        'dateTimeMs': [
+            np.datetime64('0001-01-01 00:00', 'ms'),
+            np.datetime64('2012-05-02 12:35', 'ms'),
+            np.datetime64('2012-05-03 15:42', 'ms'),
+            np.datetime64('3000-05-03 15:42', 'ms'),
+        ],
+        'dateTimeNs': [
+            np.datetime64('1991-01-01 00:00', 'ns'),
+            np.datetime64('2012-05-02 12:35', 'ns'),
+            np.datetime64('2012-05-03 15:42', 'ns'),
+            np.datetime64('2050-05-03 15:42', 'ns'),
+        ],
+    })
+    # Not part of what we're testing, just ensuring that the inputs are what we
+    # expect.
+    assert (df.dateTimeMs.dtype, df.dateTimeNs.dtype) == (
+        # O == object, <M8[ns] == timestamp64[ns]
+        np.dtype("O"), np.dtype("<M8[ns]")
+    )
+    return df
+
+
+@pytest.mark.parquet
+def test_timestamp_as_object_parquet(tempdir):
+    # Timestamps can be stored as Parquet and reloaded into Pandas with no loss
+    # of information if the timestamp_as_object option is True.
+    df = make_df_with_timestamps()
+    table = pa.Table.from_pandas(df)
+    filename = tempdir / "timestamps_from_pandas.parquet"
+    pq.write_table(table, filename, version="2.0")
+    result = pq.read_table(filename)
+    df2 = result.to_pandas(timestamp_as_object=True)
+    tm.assert_frame_equal(df, df2, check_like=True)

Review comment:
       I quickly checked, and `check_like=True` doesn't seem to be necessary here. Do you remember why you added it?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org