Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/18 19:18:07 UTC

arrow git commit: ARROW-1120: Support for writing timestamp(ns) to Int96

Repository: arrow
Updated Branches:
  refs/heads/master 362e754b3 -> c5a89b7b4


ARROW-1120: Support for writing timestamp(ns) to Int96

cc @c-nichols

Author: Uwe L. Korn <uw...@xhochy.com>
Author: Colin Nichols <ni...@gmail.com>

Closes #865 from xhochy/ARROW-1120 and squashes the following commits:

ff70832f [Uwe L. Korn] Use integer division
99f825d3 [Uwe L. Korn] Add flag for timestamp[ns] roundtrips
7c28835b [Colin Nichols] ARROW-1120 Support for writing timestamp(ns) to Int96
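
For reference, a minimal usage sketch of the new flag as exposed by
python/pyarrow/parquet.py in this patch (the output file names below are
placeholders, not part of the change):

    import numpy as np
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Build a single-column table with nanosecond timestamps.
    start = pd.Timestamp('2001-01-01').value
    data = np.array([start, start + 1, start + 2], dtype='int64')
    arr = pa.Array.from_pandas(data, type=pa.timestamp('ns'))
    table = pa.Table.from_arrays([arr], ['timestamp[ns]'])

    # Default behaviour (flag off): timestamp[ns] is cast to timestamp[us]
    # on write, as exercised in test_date_time_types below.
    pq.write_table(table, 'example_us.parquet', version='2.0')

    # With the flag enabled, the column is written as the deprecated INT96
    # physical type and round-trips with nanosecond precision.
    pq.write_table(table, 'example_int96.parquet', version='2.0',
                   use_deprecated_int96_timestamps=True)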


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c5a89b7b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c5a89b7b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c5a89b7b

Branch: refs/heads/master
Commit: c5a89b7b4fb94c3988677c5d80405ff7e9cfbd18
Parents: 362e754
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Tue Jul 18 15:18:03 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jul 18 15:18:03 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/_parquet.pxd          |  9 +++++++++
 python/pyarrow/_parquet.pyx          | 17 ++++++++++++++--
 python/pyarrow/parquet.py            | 11 +++++++----
 python/pyarrow/tests/test_parquet.py | 33 +++++++++++++++++++++++++++----
 4 files changed, 60 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/_parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 3d2d0c8..b1cd5eb 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -247,8 +247,17 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
         CStatus Open(const CSchema& schema, CMemoryPool* pool,
                      const shared_ptr[OutputStream]& sink,
                      const shared_ptr[WriterProperties]& properties,
+                     const shared_ptr[ArrowWriterProperties]& arrow_properties,
                      unique_ptr[FileWriter]* writer)
 
         CStatus WriteTable(const CTable& table, int64_t chunk_size)
         CStatus NewRowGroup(int64_t chunk_size)
         CStatus Close()
+
+    cdef cppclass ArrowWriterProperties:
+        cppclass Builder:
+            Builder()
+            Builder* disable_deprecated_int96_timestamps()
+            Builder* enable_deprecated_int96_timestamps()
+            shared_ptr[ArrowWriterProperties] build()
+        c_bool support_deprecated_int96_timestamps()

http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 0e0d58e..bbe5203 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -545,13 +545,14 @@ cdef class ParquetWriter:
 
     cdef readonly:
         object use_dictionary
+        object use_deprecated_int96_timestamps
         object compression
         object version
         int row_group_size
 
     def __cinit__(self, where, Schema schema, use_dictionary=None,
                   compression=None, version=None,
-                  MemoryPool memory_pool=None):
+                  MemoryPool memory_pool=None, use_deprecated_int96_timestamps=False):
         cdef:
             shared_ptr[FileOutputStream] filestream
             shared_ptr[OutputStream] sink
@@ -566,6 +567,7 @@ cdef class ParquetWriter:
         self.use_dictionary = use_dictionary
         self.compression = compression
         self.version = version
+        self.use_deprecated_int96_timestamps = use_deprecated_int96_timestamps
 
         cdef WriterProperties.Builder properties_builder
         self._set_version(&properties_builder)
@@ -573,10 +575,21 @@ cdef class ParquetWriter:
         self._set_dictionary_props(&properties_builder)
         properties = properties_builder.build()
 
+        cdef ArrowWriterProperties.Builder arrow_properties_builder
+        self._set_int96_support(&arrow_properties_builder)
+        arrow_properties = arrow_properties_builder.build()
+
         check_status(
             FileWriter.Open(deref(schema.schema),
                             maybe_unbox_memory_pool(memory_pool),
-                            sink, properties, &self.writer))
+                            sink, properties, arrow_properties,
+                            &self.writer))
+
+    cdef void _set_int96_support(self, ArrowWriterProperties.Builder* props):
+        if self.use_deprecated_int96_timestamps:
+            props.enable_deprecated_int96_timestamps()
+        else:
+            props.disable_deprecated_int96_timestamps()
 
     cdef void _set_version(self, WriterProperties.Builder* props):
         if self.version is not None:

http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 06b3a3d..64cf330 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -743,7 +743,8 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None):
 
 
 def write_table(table, where, row_group_size=None, version='1.0',
-                use_dictionary=True, compression='snappy', **kwargs):
+                use_dictionary=True, compression='snappy',
+                use_deprecated_int96_timestamps=False, **kwargs):
     """
     Write a Table to Parquet format
 
@@ -766,12 +767,13 @@ def write_table(table, where, row_group_size=None, version='1.0',
     writer = ParquetWriter(where, table.schema,
                            use_dictionary=use_dictionary,
                            compression=compression,
-                           version=version)
+                           version=version,
+                           use_deprecated_int96_timestamps=use_deprecated_int96_timestamps)
     writer.write_table(table, row_group_size=row_group_size)
     writer.close()
 
 
-def write_metadata(schema, where, version='1.0'):
+def write_metadata(schema, where, version='1.0', use_deprecated_int96_timestamps=False):
     """
     Write metadata-only Parquet file from schema
 
@@ -782,5 +784,6 @@ def write_metadata(schema, where, version='1.0'):
     version : {"1.0", "2.0"}, default "1.0"
         The Parquet format version, defaults to 1.0
     """
-    writer = ParquetWriter(where, schema, version=version)
+    writer = ParquetWriter(where, schema, version=version,
+                           use_deprecated_int96_timestamps=use_deprecated_int96_timestamps)
     writer.close()

http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index d17eb14..40e44b3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -456,20 +456,45 @@ def test_date_time_types():
     ex_t6 = pa.time32('ms')
     ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)
 
-    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
+    t7 = pa.timestamp('ns')
+    start = pd.Timestamp('2001-01-01').value
+    data7 = np.array([start, start + 1, start + 2], dtype='int64')
+    a7 = pa.Array.from_pandas(data7, type=t7)
+
+    t7_us = pa.timestamp('us')
+    start = pd.Timestamp('2001-01-01').value
+    data7_us = np.array([start, start + 1, start + 2], dtype='int64') // 1000
+    a7_us = pa.Array.from_pandas(data7_us, type=t7_us)
+
+    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
                                  ['date32', 'date64', 'timestamp[us]',
                                   'time32[s]', 'time64[us]',
-                                  'time32_from64[s]'])
+                                  'time32_from64[s]',
+                                  'timestamp[ns]'])
 
     # date64 as date32
     # time32[s] to time32[ms]
-    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],
+    # 'timestamp[ns]' to 'timestamp[us]'
+    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us],
                                     ['date32', 'date64', 'timestamp[us]',
                                      'time32[s]', 'time64[us]',
-                                     'time32_from64[s]'])
+                                     'time32_from64[s]',
+                                     'timestamp[ns]'])
 
     _check_roundtrip(table, expected=expected, version='2.0')
 
+    # date64 as date32
+    # time32[s] to time32[ms]
+    # 'timestamp[ns]' is saved as INT96 timestamp
+    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
+                                    ['date32', 'date64', 'timestamp[us]',
+                                     'time32[s]', 'time64[us]',
+                                     'time32_from64[s]',
+                                     'timestamp[ns]'])
+
+    _check_roundtrip(table, expected=expected, version='2.0',
+                     use_deprecated_int96_timestamps=True)
+
     # Unsupported stuff
     def _assert_unsupported(array):
         table = pa.Table.from_arrays([array], ['unsupported'])