You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/25 18:54:23 UTC
arrow git commit: ARROW-865: [Python] Add unit tests validating Parquet date/time type roundtrips
Repository: arrow
Updated Branches:
refs/heads/master 0bee8040e -> 68decb6f3
ARROW-865: [Python] Add unit tests validating Parquet date/time type roundtrips
Requires PARQUET-915 https://github.com/apache/parquet-cpp/pull/311
Author: Wes McKinney <we...@twosigma.com>
Closes #595 from wesm/ARROW-865 and squashes the following commits:
db16940 [Wes McKinney] Add tests for auto-casted types, and unsupported nanosecond time
475fa3f [Wes McKinney] Fix test case
fad3934 [Wes McKinney] Update test case
da96a38 [Wes McKinney] Add failing Parquet test case. Enable same-type-size cases in pandas_convert.cc
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/68decb6f
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/68decb6f
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/68decb6f
Branch: refs/heads/master
Commit: 68decb6f33cb1ed262006d4b237137e36f89057c
Parents: 0bee804
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Apr 25 14:54:18 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Apr 25 14:54:18 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/python/pandas_convert.cc | 2 +-
cpp/src/arrow/python/type_traits.h | 48 +++++++++++++++++++++++
cpp/src/arrow/util/stl.h | 2 +-
python/pyarrow/tests/test_ipc.py | 3 +-
python/pyarrow/tests/test_parquet.py | 60 +++++++++++++++++++++++++++++
5 files changed, 112 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/68decb6f/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 636a3fd..9f65af4 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -444,7 +444,7 @@ inline Status PandasConverter::ConvertData(std::shared_ptr<Buffer>* data) {
// Handle LONGLONG->INT64 and other fun things
int type_num_compat = cast_npy_type_compat(PyArray_DESCR(arr_)->type_num);
- if (traits::npy_type != type_num_compat) {
+ if (numpy_type_size(traits::npy_type) != numpy_type_size(type_num_compat)) {
return Status::NotImplemented("NumPy type casts not yet implemented");
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/68decb6f/cpp/src/arrow/python/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h
index 26b15bd..b6761ae 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+// Internal header
+
#include "arrow/python/platform.h"
#include <cstdint>
@@ -24,6 +26,7 @@
#include "arrow/builder.h"
#include "arrow/type.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace py {
@@ -224,5 +227,50 @@ struct arrow_traits<Type::BINARY> {
static constexpr bool supports_nulls = true;
};
+static inline int numpy_type_size(int npy_type) {  // Maps a NumPy type number to a fixed per-element byte width; yields -1 for types not listed below.
+ switch (npy_type) {
+ case NPY_BOOL:
+ return 1;
+ case NPY_INT8:
+ return 1;
+ case NPY_INT16:
+ return 2;
+ case NPY_INT32:
+ return 4;
+ case NPY_INT64:
+ return 8;
+#if (NPY_INT64 != NPY_LONGLONG)  // compile the extra case only when the two constants differ, otherwise it would be a duplicate case label
+ case NPY_LONGLONG:
+ return 8;
+#endif
+ case NPY_UINT8:
+ return 1;
+ case NPY_UINT16:
+ return 2;
+ case NPY_UINT32:
+ return 4;
+ case NPY_UINT64:
+ return 8;
+#if (NPY_UINT64 != NPY_ULONGLONG)  // same duplicate-label guard as for the signed variant above
+ case NPY_ULONGLONG:
+ return 8;
+#endif
+ case NPY_FLOAT16:
+ return 2;
+ case NPY_FLOAT32:
+ return 4;
+ case NPY_FLOAT64:
+ return 8;
+ case NPY_DATETIME:
+ return 8;
+ case NPY_OBJECT:
+ return sizeof(void*);  // pointer width of the build target; the size_t result is converted to the int return type
+ default:
+ DCHECK(false) << "unhandled numpy type";  // NOTE(review): presumably a debug-only assertion from arrow/util/logging.h - confirm it is compiled out in release builds
+ break;
+ }
+ return -1;  // sentinel reached only via the default branch. NOTE(review): every unhandled type yields the same -1, so two different unhandled types compare equal in a size-equality check - verify callers
+}
+
} // namespace py
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/68decb6f/cpp/src/arrow/util/stl.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h
index bfce111..d58689b 100644
--- a/cpp/src/arrow/util/stl.h
+++ b/cpp/src/arrow/util/stl.h
@@ -20,7 +20,7 @@
#include <vector>
-#include <arrow/util/logging.h>
+#include "arrow/util/logging.h"
namespace arrow {
http://git-wip-us.apache.org/repos/asf/arrow/blob/68decb6f/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 81213ed..0204067 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -158,7 +158,8 @@ class TestSocket(MessagingTest, unittest.TestCase):
connection.close()
def get_result(self):
- return(self._schema, self._table if self._do_read_all else self._batches)
+ return(self._schema, self._table if self._do_read_all
+ else self._batches)
def setUp(self):
# NOTE: must start and stop server in test
http://git-wip-us.apache.org/repos/asf/arrow/blob/68decb6f/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 268e87a..8c446af 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -349,6 +349,66 @@ def test_column_of_lists(tmpdir):
@parquet
+def test_date_time_types(tmpdir):  # Writes six date/time arrays to an in-memory Parquet buffer, reads them back and compares with an expected table; tmpdir is not used in the body.
+ buf = io.BytesIO()
+
+ t1 = pa.date32()
+ data1 = np.array([17259, 17260, 17261], dtype='int32')  # raw int32 values reinterpreted as date32 by from_pandas below
+ a1 = pa.Array.from_pandas(data1, type=t1)
+
+ t2 = pa.date64()
+ data2 = data1.astype('int64') * 86400000  # 86400000 = 24 * 60 * 60 * 1000; presumably converts days to milliseconds for date64 - confirm
+ a2 = pa.Array.from_pandas(data2, type=t2)
+
+ t3 = pa.timestamp('us')
+ start = pd.Timestamp('2000-01-01').value / 1000  # presumably scales nanoseconds down to microseconds - confirm. NOTE(review): '/' is true division on Python 3 and yields a float; '//' may be intended
+ data3 = np.array([start, start + 1, start + 2], dtype='int64')  # three consecutive values, cast to int64
+ a3 = pa.Array.from_pandas(data3, type=t3)
+
+ t4 = pa.time32('ms')
+ data4 = np.arange(3, dtype='i4')  # values 0, 1, 2; reused for a4, a5, a6 and a7
+ a4 = pa.Array.from_pandas(data4, type=t4)
+
+ t5 = pa.time64('us')
+ a5 = pa.Array.from_pandas(data4.astype('int64'), type=t5)
+
+ t6 = pa.time32('s')
+ a6 = pa.Array.from_pandas(data4, type=t6)
+
+ ex_t6 = pa.time32('ms')
+ ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6)  # the a6 values multiplied by 1000 and typed time32('ms'); used as the expected read-back of a6
+
+ table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6],
+ ['date32', 'date64', 'timestamp[us]',
+ 'time32[s]', 'time64[us]', 'time32[s]'])  # NOTE(review): 'time32[s]' appears twice, and its first use labels a4, which was built with time32('ms') - confirm the names are intended
+
+ # date64 as date32
+ # time32[s] to time32[ms]
+ expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6],  # differs from `table` only in column 2 (a1 replaces a2) and column 6 (ex_a6 replaces a6)
+ ['date32', 'date64', 'timestamp[us]',
+ 'time32[s]', 'time64[us]', 'time32[s]'])  # same column names as `table`
+
+ pq.write_table(table, buf, version="2.0")
+ buf.seek(0)  # rewind so read_table starts at the beginning of the buffer
+
+ result = pq.read_table(buf)
+ assert result.equals(expected)
+
+ # Unsupported stuff
+ def _assert_unsupported(array):  # helper: writing a one-column table holding `array` must raise NotImplementedError
+ table = pa.Table.from_arrays([array], ['unsupported'])
+ buf = io.BytesIO()  # fresh buffer, local to the helper
+
+ with pytest.raises(NotImplementedError):
+ pq.write_table(table, buf, version="2.0")
+
+ t7 = pa.time64('ns')
+ a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7)
+
+ _assert_unsupported(a7)  # time64('ns') is the one case checked as unsupported on write
+
+
+@parquet
def test_multithreaded_read():
df = alltypes_sample(size=10000)