You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/13 13:46:55 UTC
arrow git commit: ARROW-479: Python: Test for expected schema in
Pandas conversion
Repository: arrow
Updated Branches:
refs/heads/master c5663c6d0 -> 5ffbda1b4
ARROW-479: Python: Test for expected schema in Pandas conversion
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #281 from xhochy/ARROW-479 and squashes the following commits:
acd9abd [Uwe L. Korn] Use arrow::timestamp()
43dba37 [Uwe L. Korn] Fix tests
7a3f5b8 [Uwe L. Korn] ARROW-479: Python: Test for expected schema in Pandas conversion
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5ffbda1b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5ffbda1b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5ffbda1b
Branch: refs/heads/master
Commit: 5ffbda1b408951cb5cf49008920f1054544148d3
Parents: c5663c6
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Fri Jan 13 08:46:48 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jan 13 08:46:48 2017 -0500
----------------------------------------------------------------------
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/includes/pyarrow.pxd | 4 +-
python/pyarrow/schema.pyx | 38 ++++++++++-
python/pyarrow/tests/test_convert_builtin.py | 2 +-
python/pyarrow/tests/test_convert_pandas.py | 77 ++++++++++++++++-------
python/pyarrow/tests/test_parquet.py | 2 +-
python/src/pyarrow/helpers.cc | 3 -
7 files changed, 97 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d1970e5..8cfaaf7 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -60,6 +60,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string ToString()
+ shared_ptr[CDataType] timestamp(TimeUnit unit)
+
cdef cppclass MemoryPool" arrow::MemoryPool":
int64_t bytes_allocated()
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index dc6ccd2..901e6c9 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -19,13 +19,15 @@
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
- CDataType, CStatus, Type, MemoryPool)
+ CDataType, CStatus, Type, MemoryPool,
+ TimeUnit)
cimport pyarrow.includes.libarrow_io as arrow_io
cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CDataType] GetPrimitiveType(Type type)
+ shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out)
CStatus PandasToArrow(MemoryPool* pool, object ao,
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index d91ae7c..f6a1a10 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -23,8 +23,20 @@
# cython: embedsignature = True
from pyarrow.compat import frombytes, tobytes
-from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType,
+ Type_NA, Type_BOOL,
+ Type_UINT8, Type_INT8,
+ Type_UINT16, Type_INT16,
+ Type_UINT32, Type_INT32,
+ Type_UINT64, Type_INT64,
+ Type_TIMESTAMP, Type_DATE,
+ Type_FLOAT, Type_DOUBLE,
+ Type_STRING, Type_BINARY,
+ TimeUnit_SECOND, TimeUnit_MILLI,
+ TimeUnit_MICRO, TimeUnit_NANO,
+ Type, TimeUnit)
cimport pyarrow.includes.pyarrow as pyarrow
+cimport pyarrow.includes.libarrow as libarrow
cimport cpython
@@ -197,8 +209,28 @@ def uint64():
def int64():
return primitive_type(Type_INT64)
-def timestamp():
- return primitive_type(Type_TIMESTAMP)
+cdef dict _timestamp_type_cache = {}
+
+def timestamp(unit_str):
+ cdef TimeUnit unit
+ if unit_str == "s":
+ unit = TimeUnit_SECOND
+ elif unit_str == 'ms':
+ unit = TimeUnit_MILLI
+ elif unit_str == 'us':
+ unit = TimeUnit_MICRO
+ elif unit_str == 'ns':
+ unit = TimeUnit_NANO
+ else:
+ raise TypeError('Invalid TimeUnit string')
+
+ if unit in _timestamp_type_cache:
+ return _timestamp_type_cache[unit]
+
+ cdef DataType out = DataType()
+ out.init(libarrow.timestamp(unit))
+ _timestamp_type_cache[unit] = out
+ return out
def date():
return primitive_type(Type_DATE)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 6116742..72e4389 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -112,7 +112,7 @@ class TestConvertList(unittest.TestCase):
]
arr = pyarrow.from_pylist(data)
assert len(arr) == 4
- assert arr.type == pyarrow.timestamp()
+ assert arr.type == pyarrow.timestamp('us')
assert arr.null_count == 1
assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
23, 34, 123456)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 12e7a08..261eaa8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -60,65 +60,79 @@ class TestPandasConversion(unittest.TestCase):
pass
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
- timestamps_to_ms=False):
+ timestamps_to_ms=False, expected_schema=None):
table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms)
result = table.to_pandas(nthreads=nthreads)
+ if expected_schema:
+ assert table.schema.equals(expected_schema)
if expected is None:
expected = df
tm.assert_frame_equal(result, expected)
def test_float_no_nulls(self):
data = {}
- numpy_dtypes = ['f4', 'f8']
+ fields = []
+ dtypes = [('f4', A.float_()), ('f8', A.double())]
num_values = 100
- for dtype in numpy_dtypes:
+ for numpy_dtype, arrow_dtype in dtypes:
values = np.random.randn(num_values)
- data[dtype] = values.astype(dtype)
+ data[numpy_dtype] = values.astype(numpy_dtype)
+ fields.append(A.Field.from_py(numpy_dtype, arrow_dtype))
df = pd.DataFrame(data)
- self._check_pandas_roundtrip(df)
+ schema = A.Schema.from_fields(fields)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
num_values = 100
null_mask = np.random.randint(0, 10, size=num_values) < 3
- dtypes = ['f4', 'f8']
+ dtypes = [('f4', A.float_()), ('f8', A.double())]
+ names = ['f4', 'f8']
expected_cols = []
arrays = []
- for name in dtypes:
+ fields = []
+ for name, arrow_dtype in dtypes:
values = np.random.randn(num_values).astype(name)
arr = A.from_pandas_series(values, null_mask)
arrays.append(arr)
-
+ fields.append(A.Field.from_py(name, arrow_dtype))
values[null_mask] = np.nan
expected_cols.append(values)
- ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
- columns=dtypes)
+ ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
+ columns=names)
- table = A.Table.from_arrays(dtypes, arrays)
+ table = A.Table.from_arrays(names, arrays)
+ assert table.schema.equals(A.Schema.from_fields(fields))
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
def test_integer_no_nulls(self):
data = {}
+ fields = []
- numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+ numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()),
+ ('i4', A.int32()), ('i8', A.int64()),
+ ('u1', A.uint8()), ('u2', A.uint16()),
+ ('u4', A.uint32()), ('u8', A.uint64())]
num_values = 100
- for dtype in numpy_dtypes:
+ for dtype, arrow_dtype in numpy_dtypes:
info = np.iinfo(dtype)
values = np.random.randint(info.min,
min(info.max, np.iinfo('i8').max),
size=num_values)
data[dtype] = values.astype(dtype)
+ fields.append(A.Field.from_py(dtype, arrow_dtype))
df = pd.DataFrame(data)
- self._check_pandas_roundtrip(df)
+ schema = A.Schema.from_fields(fields)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_integer_with_nulls(self):
# pandas requires upcast to float dtype
@@ -155,7 +169,9 @@ class TestPandasConversion(unittest.TestCase):
np.random.seed(0)
df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
- self._check_pandas_roundtrip(df)
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_nulls(self):
# pandas requires upcast to object dtype
@@ -170,9 +186,12 @@ class TestPandasConversion(unittest.TestCase):
expected = values.astype(object)
expected[mask] = None
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
ex_frame = pd.DataFrame({'bools': expected})
table = A.Table.from_arrays(['bools'], [arr])
+ assert table.schema.equals(schema)
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
@@ -180,14 +199,18 @@ class TestPandasConversion(unittest.TestCase):
def test_boolean_object_nulls(self):
arr = np.array([False, None, True] * 100, dtype=object)
df = pd.DataFrame({'bools': arr})
- self._check_pandas_roundtrip(df)
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_unicode(self):
repeats = 1000
values = [u'foo', None, u'bar', u'mañana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
+ field = A.Field.from_py('strings', A.string())
+ schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_bytes_to_binary(self):
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
@@ -208,7 +231,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+ field = A.Field.from_py('datetime64', A.timestamp('ms'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -217,7 +242,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+ field = A.Field.from_py('datetime64', A.timestamp('ns'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
def test_timestamps_notimezone_nulls(self):
df = pd.DataFrame({
@@ -227,8 +254,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- df.info()
- self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+ field = A.Field.from_py('datetime64', A.timestamp('ms'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -237,7 +265,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+ field = A.Field.from_py('datetime64', A.timestamp('ns'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
def test_date(self):
df = pd.DataFrame({
@@ -246,6 +276,9 @@ class TestPandasConversion(unittest.TestCase):
datetime.date(1970, 1, 1),
datetime.date(2040, 2, 26)]})
table = A.Table.from_pandas(df)
+ field = A.Field.from_py('date', A.date())
+ schema = A.Schema.from_fields([field])
+ assert table.schema.equals(schema)
result = table.to_pandas()
expected = df.copy()
expected['date'] = pd.to_datetime(df['date'])
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index ad4bc58..e157155 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -244,7 +244,7 @@ def test_parquet_metadata_api():
a_table = A.Table.from_pandas(df, timestamps_to_ms=True)
buf = io.BytesIO()
- pq.write_table(a_table, buf, compression='snappy', version='2.0')
+ pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')
buf.seek(0)
fileh = pq.ParquetFile(buf)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index 3f65032..78fad16 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -41,9 +41,6 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(UINT64, uint64);
GET_PRIMITIVE_TYPE(INT64, int64);
GET_PRIMITIVE_TYPE(DATE, date);
- case Type::TIMESTAMP:
- return arrow::timestamp(arrow::TimeUnit::MICRO);
- break;
GET_PRIMITIVE_TYPE(BOOL, boolean);
GET_PRIMITIVE_TYPE(FLOAT, float32);
GET_PRIMITIVE_TYPE(DOUBLE, float64);