You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/10 13:47:13 UTC
arrow git commit: ARROW-761: [C++/Python] Add GetTensorSize method, Python bindings
Repository: arrow
Updated Branches:
refs/heads/master 793f4e0c5 -> e327c2e08
ARROW-761: [C++/Python] Add GetTensorSize method, Python bindings
This computes the memory footprint of a serialized `arrow::Tensor` so that an appropriate memory region can be allocated.
Author: Wes McKinney <we...@twosigma.com>
Closes #521 from wesm/ARROW-761 and squashes the following commits:
983177e [Wes McKinney] Fix sign comparison warning
0d787ad [Wes McKinney] Add GetTensorSize method, Python bindings
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e327c2e0
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e327c2e0
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e327c2e0
Branch: refs/heads/master
Commit: e327c2e08d51ee13b3cf3b8801cd3adfe88b3f7c
Parents: 793f4e0
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 10 09:47:08 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Apr 10 09:47:08 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 ++++
cpp/src/arrow/ipc/writer.cc | 29 ++++++++++++++----------
cpp/src/arrow/ipc/writer.h | 4 ++++
python/pyarrow/__init__.py | 3 ++-
python/pyarrow/includes/libarrow.pxd | 3 +++
python/pyarrow/io.pyx | 20 ++++++++++++++++
python/pyarrow/tests/test_convert_pandas.py | 4 ++--
python/pyarrow/tests/test_ipc.py | 9 ++++++++
python/pyarrow/tests/test_tensor.py | 9 +++++++-
9 files changed, 69 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/ipc-read-write-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 98a7c3d..cfba0d0 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -640,6 +640,10 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
CheckTensorRoundTrip(t0);
CheckTensorRoundTrip(tzero);
+
+ int64_t serialized_size;
+ ASSERT_OK(GetTensorSize(t0, &serialized_size));
+ ASSERT_TRUE(serialized_size > static_cast<int64_t>(size * sizeof(int64_t)));
}
TEST_F(TestTensorRoundTrip, NonContiguous) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 18a5855..8ba00a6 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -192,16 +192,6 @@ class RecordBatchWriter : public ArrayVisitor {
return Status::OK();
}
- Status GetTotalSize(const RecordBatch& batch, int64_t* size) {
- // emulates the behavior of Write without actually writing
- int32_t metadata_length = 0;
- int64_t body_length = 0;
- MockOutputStream dst;
- RETURN_NOT_OK(Write(batch, &dst, &metadata_length, &body_length));
- *size = dst.GetExtentBytesWritten();
- return Status::OK();
- }
-
protected:
template <typename ArrayType>
Status VisitFixedWidth(const ArrayType& array) {
@@ -522,8 +512,23 @@ Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr<Array>& dict
}
Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size) {
- RecordBatchWriter writer(default_memory_pool(), 0, kMaxNestingDepth, true);
- RETURN_NOT_OK(writer.GetTotalSize(batch, size));
+ // emulates the behavior of Write without actually writing
+ int32_t metadata_length = 0;
+ int64_t body_length = 0;
+ MockOutputStream dst;
+ RETURN_NOT_OK(WriteRecordBatch(batch, 0, &dst, &metadata_length, &body_length,
+ default_memory_pool(), kMaxNestingDepth, true));
+ *size = dst.GetExtentBytesWritten();
+ return Status::OK();
+}
+
+Status GetTensorSize(const Tensor& tensor, int64_t* size) {
+ // emulates the behavior of Write without actually writing
+ int32_t metadata_length = 0;
+ int64_t body_length = 0;
+ MockOutputStream dst;
+ RETURN_NOT_OK(WriteTensor(tensor, &dst, &metadata_length, &body_length));
+ *size = dst.GetExtentBytesWritten();
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/writer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h
index 629bcb9..b71becb 100644
--- a/cpp/src/arrow/ipc/writer.h
+++ b/cpp/src/arrow/ipc/writer.h
@@ -81,6 +81,10 @@ Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr<Array>& dict
// Flatbuffers metadata.
Status ARROW_EXPORT GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
+// Compute the precise number of bytes needed in a contiguous memory segment to
+// write the tensor including metadata, padding, and data
+Status ARROW_EXPORT GetTensorSize(const Tensor& tensor, int64_t* size);
+
class ARROW_EXPORT StreamWriter {
public:
virtual ~StreamWriter();
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 7b23cf6..df615b4 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -51,7 +51,8 @@ from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface,
Buffer, BufferReader, InMemoryOutputStream,
MemoryMappedFile, memory_map,
frombuffer, read_tensor, write_tensor,
- memory_map, create_memory_map)
+ memory_map, create_memory_map,
+ get_record_batch_size, get_tensor_size)
from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 71b5c8d..40dd837 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -544,6 +544,9 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil:
CStatus GetRecordBatch(int i, shared_ptr[CRecordBatch]* batch)
+ CStatus GetRecordBatchSize(const CRecordBatch& batch, int64_t* size)
+ CStatus GetTensorSize(const CTensor& tensor, int64_t* size)
+
CStatus WriteTensor(const CTensor& tensor, OutputStream* dst,
int32_t* metadata_length,
int64_t* body_length)
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 98b5a62..4eb0816 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -1202,6 +1202,26 @@ cdef class FeatherReader:
return col
+def get_tensor_size(Tensor tensor):
+ """
+ Return total size of serialized Tensor including metadata and padding
+ """
+ cdef int64_t size
+ with nogil:
+ check_status(GetTensorSize(deref(tensor.tp), &size))
+ return size
+
+
+def get_record_batch_size(RecordBatch batch):
+ """
+ Return total size of serialized RecordBatch including metadata and padding
+ """
+ cdef int64_t size
+ with nogil:
+ check_status(GetRecordBatchSize(deref(batch.batch), &size))
+ return size
+
+
def write_tensor(Tensor tensor, NativeFile dest):
"""
Write pyarrow.Tensor to pyarrow.NativeFile object its current position
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index d1bea0b..4a57e4b 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -68,7 +68,7 @@ class TestPandasConversion(unittest.TestCase):
timestamps_to_ms=False, expected_schema=None,
check_dtype=True, schema=None):
table = pa.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
- schema=schema)
+ schema=schema)
result = table.to_pandas(nthreads=nthreads)
if expected_schema:
assert table.schema.equals(expected_schema)
@@ -79,7 +79,7 @@ class TestPandasConversion(unittest.TestCase):
def _check_array_roundtrip(self, values, expected=None, mask=None,
timestamps_to_ms=False, type=None):
arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
- mask=mask, type=type)
+ mask=mask, type=type)
result = arr.to_pandas()
values_nulls = pd.isnull(values)
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 4c9dad1..31d418d 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -151,6 +151,15 @@ def test_ipc_zero_copy_numpy():
assert_frame_equal(df, rdf)
+def test_get_record_batch_size():
+ N = 10
+ itemsize = 8
+ df = pd.DataFrame({'foo': np.random.randn(N)})
+
+ batch = pa.RecordBatch.from_pandas(df)
+ assert pa.get_record_batch_size(batch) > (N * itemsize)
+
+
def write_file(batch, sink):
writer = pa.FileWriter(sink, batch.schema)
writer.write_batch(batch)
http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_tensor.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index 327b7f0..ec71735 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -42,10 +42,11 @@ def test_tensor_attrs():
tensor = pa.Tensor.from_numpy(data2)
assert not tensor.is_mutable
+
def test_tensor_base_object():
tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
n = sys.getrefcount(tensor)
- array = tensor.to_numpy()
+ array = tensor.to_numpy() # noqa
assert sys.getrefcount(tensor) == n + 1
@@ -111,3 +112,9 @@ def test_tensor_ipc_strided():
pa.write_tensor(tensor, mmap)
finally:
_try_delete(path)
+
+
+def test_tensor_size():
+ data = np.random.randn(10, 4)
+ tensor = pa.Tensor.from_numpy(data)
+ assert pa.get_tensor_size(tensor) > (data.size * 8)