You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/27 14:44:02 UTC
arrow git commit: ARROW-658: [C++] Implement a prototype in-memory arrow::Tensor type

Repository: arrow
Updated Branches:
  refs/heads/master 3aac4adef -> d2d27555b


ARROW-658: [C++] Implement a prototype in-memory arrow::Tensor type

I haven't implemented much beyond the data container and the automatic computation of row-major strides. If we agree on the basics, then I will implement IPC reads/writes of this data structure in a follow-up patch.

cc @pcmoritz @robertnishihara @JohanMabille @sylvaincorlay

Author: Wes McKinney <we...@twosigma.com>

Closes #438 from wesm/ARROW-658 and squashes the following commits:

7f82028 [Wes McKinney] Include numeric STL header
8160393 [Wes McKinney] std::accumulate is in algorithm header
bdd4c55 [Wes McKinney] No need to special case 0-dim
471c719 [Wes McKinney] Add test for 0-d tensor. Use std::accumulate in Tensor::size
8d4a13a [Wes McKinney] Make std::vector args const-refs
8bd9716 [Wes McKinney] Add extern templates for numeric tensors
7d805bf [Wes McKinney] cpplint
8b65aea [Wes McKinney] Implement a prototype in-memory arrow::Tensor type


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d2d27555
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d2d27555
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d2d27555

Branch: refs/heads/master
Commit: d2d27555b4b2f3f0ba26539211bfe8b4d1b52481
Parents: 3aac4ad
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Mar 27 10:43:56 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Mar 27 10:43:56 2017 -0400

----------------------------------------------------------------------
 cpp/CMakeLists.txt           |   1 +
 cpp/src/arrow/CMakeLists.txt |   1 +
 cpp/src/arrow/buffer.cc      |   4 -
 cpp/src/arrow/buffer.h       |   7 +-
 cpp/src/arrow/tensor-test.cc |  73 ++++++++++++++++++
 cpp/src/arrow/tensor.cc      | 116 ++++++++++++++++++++++++++++
 cpp/src/arrow/tensor.h       | 158 ++++++++++++++++++++++++++++++++++++++
 cpp/src/arrow/type_fwd.h     |  13 +++-
 8 files changed, 359 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c77cf60..e4c18ca 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -792,6 +792,7 @@ set(ARROW_SRCS
   src/arrow/schema.cc
   src/arrow/status.cc
   src/arrow/table.cc
+  src/arrow/tensor.cc
   src/arrow/type.cc
   src/arrow/visitor.cc
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 0e83aac..f965f1d 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -65,6 +65,7 @@ ADD_ARROW_TEST(pretty_print-test)
 ADD_ARROW_TEST(status-test)
 ADD_ARROW_TEST(type-test)
 ADD_ARROW_TEST(table-test)
+ADD_ARROW_TEST(tensor-test)
 
 ADD_ARROW_BENCHMARK(builder-benchmark)
 ADD_ARROW_BENCHMARK(column-benchmark)

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/buffer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc
index 28edf5e..be747e1 100644
--- a/cpp/src/arrow/buffer.cc
+++ b/cpp/src/arrow/buffer.cc
@@ -68,10 +68,6 @@ bool Buffer::Equals(const Buffer& other) const {
                                                             static_cast<size_t>(size_))));
 }
 
-std::shared_ptr<Buffer> MutableBuffer::GetImmutableView() {
-  return std::make_shared<Buffer>(this->get_shared_ptr(), 0, size());
-}
-
 PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) {
   if (pool == nullptr) { pool = default_memory_pool(); }
   pool_ = pool;

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index 449bb53..713d57a 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -43,7 +43,7 @@ class Status;
 /// of bytes that where allocated for the buffer in total.
 ///
 /// The following invariant is always true: Size < Capacity
-class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> {
+class ARROW_EXPORT Buffer {
  public:
   Buffer(const uint8_t* data, int64_t size)
       : is_mutable_(false), data_(data), size_(size), capacity_(size) {}
@@ -58,8 +58,6 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> {
   /// we might add utility methods to help determine if a buffer satisfies this contract.
   Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size);
 
-  std::shared_ptr<Buffer> get_shared_ptr() { return shared_from_this(); }
-
   bool is_mutable() const { return is_mutable_; }
 
   /// Return true if both buffers are the same size and contain the same bytes
@@ -111,9 +109,6 @@ class ARROW_EXPORT MutableBuffer : public Buffer {
 
   uint8_t* mutable_data() { return mutable_data_; }
 
-  /// Get a read-only view of this buffer
-  std::shared_ptr<Buffer> GetImmutableView();
-
  protected:
   MutableBuffer() : Buffer(nullptr, 0), mutable_data_(nullptr) {}
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/tensor-test.cc b/cpp/src/arrow/tensor-test.cc
new file mode 100644
index 0000000..99a9493
--- /dev/null
+++ b/cpp/src/arrow/tensor-test.cc
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Unit tests for arrow::Tensor and arrow::NumericTensor
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/buffer.h"
+#include "arrow/tensor.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+TEST(TestTensor, ZeroDim) {  // a tensor built from an empty shape reports one cell
+  const int64_t values = 1;
+  std::vector<int64_t> shape = {};  // no dimensions at all
+
+  using T = int64_t;
+
+  std::shared_ptr<MutableBuffer> buffer;
+  ASSERT_OK(AllocateBuffer(default_memory_pool(), values * sizeof(T), &buffer));
+
+  Int64Tensor t0(buffer, shape);
+
+  ASSERT_EQ(1, t0.size());  // size() multiplies the extents; the empty product is 1
+}
+
+TEST(TestTensor, BasicCtors) {  // exercises the three NumericTensor constructor overloads
+  const int64_t values = 24;
+  std::vector<int64_t> shape = {4, 6};
+  std::vector<int64_t> strides = {48, 8};  // row-major byte strides for 4x6 int64 cells
+  std::vector<std::string> dim_names = {"foo", "bar"};
+
+  using T = int64_t;
+
+  std::shared_ptr<MutableBuffer> buffer;
+  ASSERT_OK(AllocateBuffer(default_memory_pool(), values * sizeof(T), &buffer));
+
+  Int64Tensor t1(buffer, shape);  // strides omitted
+  Int64Tensor t2(buffer, shape, strides);
+  Int64Tensor t3(buffer, shape, strides, dim_names);
+
+  ASSERT_EQ(24, t1.size());
+  ASSERT_TRUE(t1.is_mutable());
+  ASSERT_FALSE(t1.has_dim_names());
+
+  ASSERT_EQ(strides, t1.strides());  // omitted strides are computed as row-major
+  ASSERT_EQ(strides, t2.strides());  // explicit strides are stored unchanged
+
+  ASSERT_EQ("foo", t3.dim_name(0));
+  ASSERT_EQ("bar", t3.dim_name(1));
+}
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc
new file mode 100644
index 0000000..c0d128f
--- /dev/null
+++ b/cpp/src/arrow/tensor.cc
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/tensor.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+void ComputeRowMajorStrides(const FixedWidthType& type, const std::vector<int64_t>& shape,
+    std::vector<int64_t>* strides) {
+  // Appends one byte stride per entry of `shape` to `*strides`, row-major: the last
+  // dimension steps by one element (type.bit_width() / 8 bytes), each earlier one by
+  // the following stride times the following extent. Multiplying back-to-front,
+  // instead of dividing a running product, is safe for zero-length dimensions.
+  const auto base = strides->size();
+  strides->resize(base + shape.size(), type.bit_width() / 8);
+  for (auto i = shape.size(); i > 1; --i) {
+    (*strides)[base + i - 2] = (*strides)[base + i - 1] * shape[i - 1];
+  }
+}
+
+/// Primary constructor: stores every argument; empty strides are filled in as row-major
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+    const std::vector<std::string>& dim_names)
+    : type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) {
+  DCHECK(is_tensor_supported(type->type));  // only the numeric type ids are accepted
+  if (shape.size() > 0 && strides.size() == 0) {  // strides omitted for a non-0-d tensor
+    ComputeRowMajorStrides(static_cast<const FixedWidthType&>(*type_), shape, &strides_);
+  }
+}
+
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& strides)
+    : Tensor(type, data, shape, strides, {}) {}  // delegates with no dimension names
+
+Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+    const std::vector<int64_t>& shape)
+    : Tensor(type, data, shape, {}, {}) {}  // delegates with no strides and no names
+
+const std::string& Tensor::dim_name(int i) const {  // returns the name of dimension i
+  DCHECK(i >= 0 && i < static_cast<int>(dim_names_.size()));  // both bounds, not just upper
+  return dim_names_[i];  // only valid when dimension names were supplied at construction
+}
+
+int64_t Tensor::size() const {  // number of cells: product of all extents, 1 for 0-d
+  return std::accumulate(  // the seed's type is the accumulator type, so seed with int64_t
+      shape_.begin(), shape_.end(), int64_t{1}, std::multiplies<int64_t>());
+}
+
+template <typename T>  // primary constructor; the two shorter overloads delegate to it
+NumericTensor<T>::NumericTensor(const std::shared_ptr<Buffer>& data,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+    const std::vector<std::string>& dim_names)
+    : Tensor(TypeTraits<T>::type_singleton(), data, shape, strides, dim_names),
+      raw_data_(nullptr),
+      mutable_raw_data_(nullptr) {
+  if (data_) {  // a null buffer leaves both typed pointers null
+    raw_data_ = reinterpret_cast<const value_type*>(data_->data());
+    if (data_->is_mutable()) {  // the writable pointer is set only for mutable buffers
+      auto mut_buf = static_cast<MutableBuffer*>(data_.get());  // NOTE(review): assumes is_mutable() implies MutableBuffer - confirm
+      mutable_raw_data_ = reinterpret_cast<value_type*>(mut_buf->mutable_data());
+    }
+  }
+}
+
+template <typename T>  // shape only: strides and dimension names are passed as empty
+NumericTensor<T>::NumericTensor(
+    const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape)
+    : NumericTensor(data, shape, {}, {}) {}
+
+template <typename T>  // shape and strides: dimension names are passed as empty
+NumericTensor<T>::NumericTensor(const std::shared_ptr<Buffer>& data,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& strides)
+    : NumericTensor(data, shape, strides, {}) {}
+
+template class NumericTensor<Int8Type>;  // explicit instantiations, one per numeric type
+template class NumericTensor<UInt8Type>;
+template class NumericTensor<Int16Type>;
+template class NumericTensor<UInt16Type>;
+template class NumericTensor<Int32Type>;
+template class NumericTensor<UInt32Type>;
+template class NumericTensor<Int64Type>;
+template class NumericTensor<UInt64Type>;
+template class NumericTensor<HalfFloatType>;
+template class NumericTensor<FloatType>;
+template class NumericTensor<DoubleType>;
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h
new file mode 100644
index 0000000..0059368
--- /dev/null
+++ b/cpp/src/arrow/tensor.h
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_TENSOR_H
+#define ARROW_TENSOR_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class MutableBuffer;
+class Status;
+
+static inline bool is_tensor_supported(Type::type type_id) {  // true for numeric ids only
+  switch (type_id) {
+    case Type::UINT8:  // integer type ids, 8 to 64 bits, unsigned and signed
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::HALF_FLOAT:  // floating-point type ids
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      break;  // any other type id reaches the `return false` below
+  }
+  return false;
+}
+
+class ARROW_EXPORT Tensor {  // typed n-dimensional container over a shared Buffer
+ public:
+  virtual ~Tensor() = default;
+
+  /// Constructor with no dimension names or strides, data assumed to be row-major
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+      const std::vector<int64_t>& shape);
+
+  /// Constructor with explicit strides (empty strides are computed as row-major)
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+      const std::vector<int64_t>& shape, const std::vector<int64_t>& strides);
+
+  /// Constructor with strides and dimension names
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+      const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+      const std::vector<std::string>& dim_names);
+
+  std::shared_ptr<Buffer> data() const { return data_; }
+  const std::vector<int64_t>& shape() const { return shape_; }
+  const std::vector<int64_t>& strides() const { return strides_; }
+
+  const std::string& dim_name(int i) const;
+  bool has_dim_names() const { return shape_.size() > 0 && dim_names_.size() > 0; }
+
+  /// Total number of value cells in the tensor
+  int64_t size() const;
+
+  /// Return true if a data buffer is present and mutable; false when there is no buffer
+  bool is_mutable() const { return data_ != nullptr && data_->is_mutable(); }
+
+ protected:
+  Tensor() {}  // leaves type_ and data_ null and all vectors empty
+
+  std::shared_ptr<DataType> type_;
+
+  std::shared_ptr<Buffer> data_;
+
+  std::vector<int64_t> shape_;
+  std::vector<int64_t> strides_;
+
+  /// These names are optional
+  std::vector<std::string> dim_names_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(Tensor);
+};
+
+template <typename T>  // T is an Arrow type class that exposes a nested c_type
+class ARROW_EXPORT NumericTensor : public Tensor {
+ public:
+  using value_type = typename T::c_type;  // element type used by the raw_data() pointers
+
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape);
+
+  /// Constructor with non-negative strides
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+      const std::vector<int64_t>& strides);
+
+  /// Constructor with strides and dimension names
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+      const std::vector<int64_t>& strides, const std::vector<std::string>& dim_names);
+
+  const value_type* raw_data() const { return raw_data_; }  // null if no buffer was given
+  value_type* raw_data() { return mutable_raw_data_; }  // null unless the buffer is mutable
+
+ private:
+  const value_type* raw_data_;  // read-only typed view of the buffer contents
+  value_type* mutable_raw_data_;  // writable typed view; set only for mutable buffers
+};
+
+// ----------------------------------------------------------------------
+// extern templates and other details
+
+// gcc and clang disagree about how to handle template visibility when you have
+// explicit specializations https://llvm.org/bugs/show_bug.cgi?id=24815
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+// Only instantiate these templates once
+extern template class ARROW_EXPORT NumericTensor<Int8Type>;
+extern template class ARROW_EXPORT NumericTensor<UInt8Type>;
+extern template class ARROW_EXPORT NumericTensor<Int16Type>;
+extern template class ARROW_EXPORT NumericTensor<UInt16Type>;
+extern template class ARROW_EXPORT NumericTensor<Int32Type>;
+extern template class ARROW_EXPORT NumericTensor<UInt32Type>;
+extern template class ARROW_EXPORT NumericTensor<Int64Type>;
+extern template class ARROW_EXPORT NumericTensor<UInt64Type>;
+extern template class ARROW_EXPORT NumericTensor<HalfFloatType>;
+extern template class ARROW_EXPORT NumericTensor<FloatType>;
+extern template class ARROW_EXPORT NumericTensor<DoubleType>;
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+}  // namespace arrow
+
+#endif  // ARROW_TENSOR_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/type_fwd.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 201f4e9..04ddf7e 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -30,6 +30,7 @@ struct DataType;
 class Array;
 class ArrayBuilder;
 struct Field;
+class Tensor;
 
 class Buffer;
 class MemoryPool;
@@ -78,10 +79,14 @@ class NumericArray;
 template <typename TypeClass>
 class NumericBuilder;
 
-#define _NUMERIC_TYPE_DECL(KLASS)                 \
-  struct KLASS##Type;                             \
-  using KLASS##Array = NumericArray<KLASS##Type>; \
-  using KLASS##Builder = NumericBuilder<KLASS##Type>;
+template <typename TypeClass>
+class NumericTensor;
+
+#define _NUMERIC_TYPE_DECL(KLASS)                     \
+  struct KLASS##Type;                                 \
+  using KLASS##Array = NumericArray<KLASS##Type>;     \
+  using KLASS##Builder = NumericBuilder<KLASS##Type>; \
+  using KLASS##Tensor = NumericTensor<KLASS##Type>;
 
 _NUMERIC_TYPE_DECL(Int8);
 _NUMERIC_TYPE_DECL(Int16);