You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/13 13:46:55 UTC
arrow git commit: ARROW-479: Python: Test for expected schema in
Pandas conversion
Repository: arrow
Updated Branches:
refs/heads/master c5663c6d0 -> 5ffbda1b4
ARROW-479: Python: Test for expected schema in Pandas conversion
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #281 from xhochy/ARROW-479 and squashes the following commits:
acd9abd [Uwe L. Korn] Use arrow::timestamp()
43dba37 [Uwe L. Korn] Fix tests
7a3f5b8 [Uwe L. Korn] ARROW-479: Python: Test for expected schema in Pandas conversion
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5ffbda1b
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5ffbda1b
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5ffbda1b
Branch: refs/heads/master
Commit: 5ffbda1b408951cb5cf49008920f1054544148d3
Parents: c5663c6
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Fri Jan 13 08:46:48 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jan 13 08:46:48 2017 -0500
----------------------------------------------------------------------
python/pyarrow/includes/libarrow.pxd | 2 +
python/pyarrow/includes/pyarrow.pxd | 4 +-
python/pyarrow/schema.pyx | 38 ++++++++++-
python/pyarrow/tests/test_convert_builtin.py | 2 +-
python/pyarrow/tests/test_convert_pandas.py | 77 ++++++++++++++++-------
python/pyarrow/tests/test_parquet.py | 2 +-
python/src/pyarrow/helpers.cc | 3 -
7 files changed, 97 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index d1970e5..8cfaaf7 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -60,6 +60,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_string ToString()
+ shared_ptr[CDataType] timestamp(TimeUnit unit)
+
cdef cppclass MemoryPool" arrow::MemoryPool":
int64_t bytes_allocated()
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/pyarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd
index dc6ccd2..901e6c9 100644
--- a/python/pyarrow/includes/pyarrow.pxd
+++ b/python/pyarrow/includes/pyarrow.pxd
@@ -19,13 +19,15 @@
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
- CDataType, CStatus, Type, MemoryPool)
+ CDataType, CStatus, Type, MemoryPool,
+ TimeUnit)
cimport pyarrow.includes.libarrow_io as arrow_io
cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CDataType] GetPrimitiveType(Type type)
+ shared_ptr[CDataType] GetTimestampType(TimeUnit unit)
CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out)
CStatus PandasToArrow(MemoryPool* pool, object ao,
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index d91ae7c..f6a1a10 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -23,8 +23,20 @@
# cython: embedsignature = True
from pyarrow.compat import frombytes, tobytes
-from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType,
+ Type_NA, Type_BOOL,
+ Type_UINT8, Type_INT8,
+ Type_UINT16, Type_INT16,
+ Type_UINT32, Type_INT32,
+ Type_UINT64, Type_INT64,
+ Type_TIMESTAMP, Type_DATE,
+ Type_FLOAT, Type_DOUBLE,
+ Type_STRING, Type_BINARY,
+ TimeUnit_SECOND, TimeUnit_MILLI,
+ TimeUnit_MICRO, TimeUnit_NANO,
+ Type, TimeUnit)
cimport pyarrow.includes.pyarrow as pyarrow
+cimport pyarrow.includes.libarrow as libarrow
cimport cpython
@@ -197,8 +209,28 @@ def uint64():
def int64():
return primitive_type(Type_INT64)
-def timestamp():
- return primitive_type(Type_TIMESTAMP)
+cdef dict _timestamp_type_cache = {}
+
+def timestamp(unit_str):
+ cdef TimeUnit unit
+ if unit_str == "s":
+ unit = TimeUnit_SECOND
+ elif unit_str == 'ms':
+ unit = TimeUnit_MILLI
+ elif unit_str == 'us':
+ unit = TimeUnit_MICRO
+ elif unit_str == 'ns':
+ unit = TimeUnit_NANO
+ else:
+ raise TypeError('Invalid TimeUnit string')
+
+ if unit in _timestamp_type_cache:
+ return _timestamp_type_cache[unit]
+
+ cdef DataType out = DataType()
+ out.init(libarrow.timestamp(unit))
+ _timestamp_type_cache[unit] = out
+ return out
def date():
return primitive_type(Type_DATE)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_builtin.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 6116742..72e4389 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -112,7 +112,7 @@ class TestConvertList(unittest.TestCase):
]
arr = pyarrow.from_pylist(data)
assert len(arr) == 4
- assert arr.type == pyarrow.timestamp()
+ assert arr.type == pyarrow.timestamp('us')
assert arr.null_count == 1
assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1,
23, 34, 123456)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 12e7a08..261eaa8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -60,65 +60,79 @@ class TestPandasConversion(unittest.TestCase):
pass
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
- timestamps_to_ms=False):
+ timestamps_to_ms=False, expected_schema=None):
table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms)
result = table.to_pandas(nthreads=nthreads)
+ if expected_schema:
+ assert table.schema.equals(expected_schema)
if expected is None:
expected = df
tm.assert_frame_equal(result, expected)
def test_float_no_nulls(self):
data = {}
- numpy_dtypes = ['f4', 'f8']
+ fields = []
+ dtypes = [('f4', A.float_()), ('f8', A.double())]
num_values = 100
- for dtype in numpy_dtypes:
+ for numpy_dtype, arrow_dtype in dtypes:
values = np.random.randn(num_values)
- data[dtype] = values.astype(dtype)
+ data[numpy_dtype] = values.astype(numpy_dtype)
+ fields.append(A.Field.from_py(numpy_dtype, arrow_dtype))
df = pd.DataFrame(data)
- self._check_pandas_roundtrip(df)
+ schema = A.Schema.from_fields(fields)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
num_values = 100
null_mask = np.random.randint(0, 10, size=num_values) < 3
- dtypes = ['f4', 'f8']
+ dtypes = [('f4', A.float_()), ('f8', A.double())]
+ names = ['f4', 'f8']
expected_cols = []
arrays = []
- for name in dtypes:
+ fields = []
+ for name, arrow_dtype in dtypes:
values = np.random.randn(num_values).astype(name)
arr = A.from_pandas_series(values, null_mask)
arrays.append(arr)
-
+ fields.append(A.Field.from_py(name, arrow_dtype))
values[null_mask] = np.nan
expected_cols.append(values)
- ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)),
- columns=dtypes)
+ ex_frame = pd.DataFrame(dict(zip(names, expected_cols)),
+ columns=names)
- table = A.Table.from_arrays(dtypes, arrays)
+ table = A.Table.from_arrays(names, arrays)
+ assert table.schema.equals(A.Schema.from_fields(fields))
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
def test_integer_no_nulls(self):
data = {}
+ fields = []
- numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
+ numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()),
+ ('i4', A.int32()), ('i8', A.int64()),
+ ('u1', A.uint8()), ('u2', A.uint16()),
+ ('u4', A.uint32()), ('u8', A.uint64())]
num_values = 100
- for dtype in numpy_dtypes:
+ for dtype, arrow_dtype in numpy_dtypes:
info = np.iinfo(dtype)
values = np.random.randint(info.min,
min(info.max, np.iinfo('i8').max),
size=num_values)
data[dtype] = values.astype(dtype)
+ fields.append(A.Field.from_py(dtype, arrow_dtype))
df = pd.DataFrame(data)
- self._check_pandas_roundtrip(df)
+ schema = A.Schema.from_fields(fields)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_integer_with_nulls(self):
# pandas requires upcast to float dtype
@@ -155,7 +169,9 @@ class TestPandasConversion(unittest.TestCase):
np.random.seed(0)
df = pd.DataFrame({'bools': np.random.randn(num_values) > 0})
- self._check_pandas_roundtrip(df)
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_boolean_nulls(self):
# pandas requires upcast to object dtype
@@ -170,9 +186,12 @@ class TestPandasConversion(unittest.TestCase):
expected = values.astype(object)
expected[mask] = None
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
ex_frame = pd.DataFrame({'bools': expected})
table = A.Table.from_arrays(['bools'], [arr])
+ assert table.schema.equals(schema)
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
@@ -180,14 +199,18 @@ class TestPandasConversion(unittest.TestCase):
def test_boolean_object_nulls(self):
arr = np.array([False, None, True] * 100, dtype=object)
df = pd.DataFrame({'bools': arr})
- self._check_pandas_roundtrip(df)
+ field = A.Field.from_py('bools', A.bool_())
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_unicode(self):
repeats = 1000
values = [u'foo', None, u'bar', u'mañana', np.nan]
df = pd.DataFrame({'strings': values * repeats})
+ field = A.Field.from_py('strings', A.string())
+ schema = A.Schema.from_fields([field])
- self._check_pandas_roundtrip(df)
+ self._check_pandas_roundtrip(df, expected_schema=schema)
def test_bytes_to_binary(self):
values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
@@ -208,7 +231,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+ field = A.Field.from_py('datetime64', A.timestamp('ms'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -217,7 +242,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+ field = A.Field.from_py('datetime64', A.timestamp('ns'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
def test_timestamps_notimezone_nulls(self):
df = pd.DataFrame({
@@ -227,8 +254,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437'],
dtype='datetime64[ms]')
})
- df.info()
- self._check_pandas_roundtrip(df, timestamps_to_ms=True)
+ field = A.Field.from_py('datetime64', A.timestamp('ms'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema)
df = pd.DataFrame({
'datetime64': np.array([
@@ -237,7 +265,9 @@ class TestPandasConversion(unittest.TestCase):
'2010-08-13T05:46:57.437699912'],
dtype='datetime64[ns]')
})
- self._check_pandas_roundtrip(df, timestamps_to_ms=False)
+ field = A.Field.from_py('datetime64', A.timestamp('ns'))
+ schema = A.Schema.from_fields([field])
+ self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema)
def test_date(self):
df = pd.DataFrame({
@@ -246,6 +276,9 @@ class TestPandasConversion(unittest.TestCase):
datetime.date(1970, 1, 1),
datetime.date(2040, 2, 26)]})
table = A.Table.from_pandas(df)
+ field = A.Field.from_py('date', A.date())
+ schema = A.Schema.from_fields([field])
+ assert table.schema.equals(schema)
result = table.to_pandas()
expected = df.copy()
expected['date'] = pd.to_datetime(df['date'])
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index ad4bc58..e157155 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -244,7 +244,7 @@ def test_parquet_metadata_api():
a_table = A.Table.from_pandas(df, timestamps_to_ms=True)
buf = io.BytesIO()
- pq.write_table(a_table, buf, compression='snappy', version='2.0')
+ pq.write_table(a_table, buf, compression='SNAPPY', version='2.0')
buf.seek(0)
fileh = pq.ParquetFile(buf)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/src/pyarrow/helpers.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc
index 3f65032..78fad16 100644
--- a/python/src/pyarrow/helpers.cc
+++ b/python/src/pyarrow/helpers.cc
@@ -41,9 +41,6 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
GET_PRIMITIVE_TYPE(UINT64, uint64);
GET_PRIMITIVE_TYPE(INT64, int64);
GET_PRIMITIVE_TYPE(DATE, date);
- case Type::TIMESTAMP:
- return arrow::timestamp(arrow::TimeUnit::MICRO);
- break;
GET_PRIMITIVE_TYPE(BOOL, boolean);
GET_PRIMITIVE_TYPE(FLOAT, float32);
GET_PRIMITIVE_TYPE(DOUBLE, float64);