Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/10 13:47:13 UTC

arrow git commit: ARROW-761: [C++/Python] Add GetTensorSize method, Python bindings

Repository: arrow
Updated Branches:
  refs/heads/master 793f4e0c5 -> e327c2e08


ARROW-761: [C++/Python] Add GetTensorSize method, Python bindings

This adds `GetTensorSize`, which computes the memory footprint of a serialized `arrow::Tensor` so that an appropriately sized memory region can be allocated before writing.
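
As a usage sketch only (not part of this commit, and assuming a pyarrow build that includes it), the new Python binding can be used to size a memory-mapped region before writing; the file path below is illustrative:

    import numpy as np
    import pyarrow as pa

    tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))

    # Full serialized footprint: metadata + padding + data, so a contiguous
    # region of exactly this size can be allocated up front.
    size = pa.get_tensor_size(tensor)

    mmap = pa.create_memory_map('/tmp/tensor-scratch.arrow', size)
    pa.write_tensor(tensor, mmap)

    # Read the tensor back from the same region.
    mmap.seek(0)
    result = pa.read_tensor(mmap)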

Author: Wes McKinney <we...@twosigma.com>

Closes #521 from wesm/ARROW-761 and squashes the following commits:

983177e [Wes McKinney] Fix sign comparison warning
0d787ad [Wes McKinney] Add GetTensorSize method, Python bindings


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e327c2e0
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e327c2e0
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e327c2e0

Branch: refs/heads/master
Commit: e327c2e08d51ee13b3cf3b8801cd3adfe88b3f7c
Parents: 793f4e0
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Apr 10 09:47:08 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Apr 10 09:47:08 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/ipc/ipc-read-write-test.cc    |  4 ++++
 cpp/src/arrow/ipc/writer.cc                 | 29 ++++++++++++++----------
 cpp/src/arrow/ipc/writer.h                  |  4 ++++
 python/pyarrow/__init__.py                  |  3 ++-
 python/pyarrow/includes/libarrow.pxd        |  3 +++
 python/pyarrow/io.pyx                       | 20 ++++++++++++++++
 python/pyarrow/tests/test_convert_pandas.py |  4 ++--
 python/pyarrow/tests/test_ipc.py            |  9 ++++++++
 python/pyarrow/tests/test_tensor.py         |  9 +++++++-
 9 files changed, 69 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/ipc-read-write-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 98a7c3d..cfba0d0 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -640,6 +640,10 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
 
   CheckTensorRoundTrip(t0);
   CheckTensorRoundTrip(tzero);
+
+  int64_t serialized_size;
+  ASSERT_OK(GetTensorSize(t0, &serialized_size));
+  ASSERT_TRUE(serialized_size > static_cast<int64_t>(size * sizeof(int64_t)));
 }
 
 TEST_F(TestTensorRoundTrip, NonContiguous) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/writer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 18a5855..8ba00a6 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -192,16 +192,6 @@ class RecordBatchWriter : public ArrayVisitor {
     return Status::OK();
   }
 
-  Status GetTotalSize(const RecordBatch& batch, int64_t* size) {
-    // emulates the behavior of Write without actually writing
-    int32_t metadata_length = 0;
-    int64_t body_length = 0;
-    MockOutputStream dst;
-    RETURN_NOT_OK(Write(batch, &dst, &metadata_length, &body_length));
-    *size = dst.GetExtentBytesWritten();
-    return Status::OK();
-  }
-
  protected:
   template <typename ArrayType>
   Status VisitFixedWidth(const ArrayType& array) {
@@ -522,8 +512,23 @@ Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr<Array>& dict
 }
 
 Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size) {
-  RecordBatchWriter writer(default_memory_pool(), 0, kMaxNestingDepth, true);
-  RETURN_NOT_OK(writer.GetTotalSize(batch, size));
+  // emulates the behavior of Write without actually writing
+  int32_t metadata_length = 0;
+  int64_t body_length = 0;
+  MockOutputStream dst;
+  RETURN_NOT_OK(WriteRecordBatch(batch, 0, &dst, &metadata_length, &body_length,
+      default_memory_pool(), kMaxNestingDepth, true));
+  *size = dst.GetExtentBytesWritten();
+  return Status::OK();
+}
+
+Status GetTensorSize(const Tensor& tensor, int64_t* size) {
+  // emulates the behavior of Write without actually writing
+  int32_t metadata_length = 0;
+  int64_t body_length = 0;
+  MockOutputStream dst;
+  RETURN_NOT_OK(WriteTensor(tensor, &dst, &metadata_length, &body_length));
+  *size = dst.GetExtentBytesWritten();
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/cpp/src/arrow/ipc/writer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h
index 629bcb9..b71becb 100644
--- a/cpp/src/arrow/ipc/writer.h
+++ b/cpp/src/arrow/ipc/writer.h
@@ -81,6 +81,10 @@ Status WriteDictionary(int64_t dictionary_id, const std::shared_ptr<Array>& dict
 // Flatbuffers metadata.
 Status ARROW_EXPORT GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
 
+// Compute the precise number of bytes needed in a contiguous memory segment to
+// write the tensor including metadata, padding, and data
+Status ARROW_EXPORT GetTensorSize(const Tensor& tensor, int64_t* size);
+
 class ARROW_EXPORT StreamWriter {
  public:
   virtual ~StreamWriter();

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 7b23cf6..df615b4 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -51,7 +51,8 @@ from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface,
                         Buffer, BufferReader, InMemoryOutputStream,
                         MemoryMappedFile, memory_map,
                         frombuffer, read_tensor, write_tensor,
-                        memory_map, create_memory_map)
+                        memory_map, create_memory_map,
+                        get_record_batch_size, get_tensor_size)
 
 from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 71b5c8d..40dd837 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -544,6 +544,9 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil:
 
         CStatus GetRecordBatch(int i, shared_ptr[CRecordBatch]* batch)
 
+    CStatus GetRecordBatchSize(const CRecordBatch& batch, int64_t* size)
+    CStatus GetTensorSize(const CTensor& tensor, int64_t* size)
+
     CStatus WriteTensor(const CTensor& tensor, OutputStream* dst,
                         int32_t* metadata_length,
                         int64_t* body_length)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/io.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index 98b5a62..4eb0816 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -1202,6 +1202,26 @@ cdef class FeatherReader:
         return col
 
 
+def get_tensor_size(Tensor tensor):
+    """
+    Return total size of serialized Tensor including metadata and padding
+    """
+    cdef int64_t size
+    with nogil:
+        check_status(GetTensorSize(deref(tensor.tp), &size))
+    return size
+
+
+def get_record_batch_size(RecordBatch batch):
+    """
+    Return total size of serialized RecordBatch including metadata and padding
+    """
+    cdef int64_t size
+    with nogil:
+        check_status(GetRecordBatchSize(deref(batch.batch), &size))
+    return size
+
+
 def write_tensor(Tensor tensor, NativeFile dest):
     """
     Write pyarrow.Tensor to pyarrow.NativeFile object its current position
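
As with the tensor helper, the new get_record_batch_size binding can be exercised directly; a minimal sketch (assuming pandas is available), comparing only against the raw float64 data, since metadata and padding make the serialized size strictly larger:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'foo': np.random.randn(10)})
    batch = pa.RecordBatch.from_pandas(df)

    # Serialized footprint includes metadata and padding, so it exceeds
    # the 10 * 8 bytes of raw float64 data.
    assert pa.get_record_batch_size(batch) > 10 * 8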

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index d1bea0b..4a57e4b 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -68,7 +68,7 @@ class TestPandasConversion(unittest.TestCase):
                                 timestamps_to_ms=False, expected_schema=None,
                                 check_dtype=True, schema=None):
         table = pa.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms,
-                                    schema=schema)
+                                     schema=schema)
         result = table.to_pandas(nthreads=nthreads)
         if expected_schema:
             assert table.schema.equals(expected_schema)
@@ -79,7 +79,7 @@ class TestPandasConversion(unittest.TestCase):
     def _check_array_roundtrip(self, values, expected=None, mask=None,
                                timestamps_to_ms=False, type=None):
         arr = pa.Array.from_numpy(values, timestamps_to_ms=timestamps_to_ms,
-                                 mask=mask, type=type)
+                                  mask=mask, type=type)
         result = arr.to_pandas()
 
         values_nulls = pd.isnull(values)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_ipc.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py
index 4c9dad1..31d418d 100644
--- a/python/pyarrow/tests/test_ipc.py
+++ b/python/pyarrow/tests/test_ipc.py
@@ -151,6 +151,15 @@ def test_ipc_zero_copy_numpy():
     assert_frame_equal(df, rdf)
 
 
+def test_get_record_batch_size():
+    N = 10
+    itemsize = 8
+    df = pd.DataFrame({'foo': np.random.randn(N)})
+
+    batch = pa.RecordBatch.from_pandas(df)
+    assert pa.get_record_batch_size(batch) > (N * itemsize)
+
+
 def write_file(batch, sink):
     writer = pa.FileWriter(sink, batch.schema)
     writer.write_batch(batch)

http://git-wip-us.apache.org/repos/asf/arrow/blob/e327c2e0/python/pyarrow/tests/test_tensor.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py
index 327b7f0..ec71735 100644
--- a/python/pyarrow/tests/test_tensor.py
+++ b/python/pyarrow/tests/test_tensor.py
@@ -42,10 +42,11 @@ def test_tensor_attrs():
     tensor = pa.Tensor.from_numpy(data2)
     assert not tensor.is_mutable
 
+
 def test_tensor_base_object():
     tensor = pa.Tensor.from_numpy(np.random.randn(10, 4))
     n = sys.getrefcount(tensor)
-    array = tensor.to_numpy()
+    array = tensor.to_numpy()  # noqa
     assert sys.getrefcount(tensor) == n + 1
 
 
@@ -111,3 +112,9 @@ def test_tensor_ipc_strided():
             pa.write_tensor(tensor, mmap)
     finally:
         _try_delete(path)
+
+
+def test_tensor_size():
+    data = np.random.randn(10, 4)
+    tensor = pa.Tensor.from_numpy(data)
+    assert pa.get_tensor_size(tensor) > (data.size * 8)