You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/02/07 19:07:16 UTC
[arrow] branch master updated: ARROW-4318: [C++] Add Tensor::CountNonZero
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7fd18a1 ARROW-4318: [C++] Add Tensor::CountNonZero
7fd18a1 is described below
commit 7fd18a17174c19de00297f04e134342c98874e94
Author: Kenta Murata <mr...@mrkn.jp>
AuthorDate: Thu Feb 7 20:07:08 2019 +0100
ARROW-4318: [C++] Add Tensor::CountNonZero
This pull request moves CountNonZero, previously defined in SparseTensorConverter, into the Tensor class, and adds tests for this function.
Author: Kenta Murata <mr...@mrkn.jp>
Author: Antoine Pitrou <an...@python.org>
Closes #3452 from mrkn/tensor_count_non_zero and squashes the following commits:
fa394d95 <Antoine Pitrou> Return a Status from Tensor::CountNonZero
ecfbbf52 <Antoine Pitrou> Fix lint
37f286a9 <Kenta Murata> Check !is_tensor_supported in NonZeroCounter::Visit
e714d7d0 <Kenta Murata> Fix spelling
c9958010 <Kenta Murata> Use VisitTypeInline in CountNonZero
ce353288 <Kenta Murata> Change the return type of CountNonZero
4b365a0b <Kenta Murata> Add docstring of Tensor::CountNonZero
08c8e265 <Kenta Murata> Refactoring: extract TensorCountNonZero to make a function simple
4072ebb1 <Kenta Murata> Use anonymous namespace
d1828ac3 <Kenta Murata> Remove needless functions
e9bfc024 <Kenta Murata> Use Tensor::CountNonZero in SparseTensorConverter
45552a84 <Kenta Murata> Add Tensor::CountNonZero
---
cpp/src/arrow/sparse_tensor.cc | 53 +++++---------------------------
cpp/src/arrow/tensor-test.cc | 50 +++++++++++++++++++++++++++++-
cpp/src/arrow/tensor.cc | 70 ++++++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/tensor.h | 3 ++
4 files changed, 129 insertions(+), 47 deletions(-)
diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index a55f51a..205c17e 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -49,45 +49,6 @@ struct SparseTensorConverterBase {
explicit SparseTensorConverterBase(const NumericTensorType& tensor) : tensor_(tensor) {}
- bool TensorIsTriviallyIterable() const {
- return tensor_.ndim() <= 1 || tensor_.is_contiguous();
- }
-
- size_t CountNonZero() const {
- if (tensor_.size() == 0) {
- return 0;
- }
-
- if (TensorIsTriviallyIterable()) {
- const value_type* data = reinterpret_cast<const value_type*>(tensor_.raw_data());
- return std::count_if(data, data + tensor_.size(),
- [](value_type x) { return x != 0; });
- }
-
- const std::vector<int64_t>& shape = tensor_.shape();
- const int64_t ndim = tensor_.ndim();
-
- size_t count = 0;
- std::vector<int64_t> coord(ndim, 0);
- for (int64_t n = tensor_.size(); n > 0; n--) {
- if (tensor_.Value(coord) != 0) {
- ++count;
- }
-
- // increment index
- ++coord[ndim - 1];
- if (n > 1 && coord[ndim - 1] == shape[ndim - 1]) {
- int64_t d = ndim - 1;
- while (d > 0 && coord[d] == shape[d]) {
- coord[d] = 0;
- ++coord[d - 1];
- --d;
- }
- }
- }
- return count;
- }
-
const NumericTensorType& tensor_;
};
@@ -96,14 +57,15 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
: private SparseTensorConverterBase<TYPE> {
public:
using BaseClass = SparseTensorConverterBase<TYPE>;
- using NumericTensorType = typename BaseClass::NumericTensorType;
- using value_type = typename BaseClass::value_type;
+ using typename BaseClass::NumericTensorType;
+ using typename BaseClass::value_type;
explicit SparseTensorConverter(const NumericTensorType& tensor) : BaseClass(tensor) {}
Status Convert() {
const int64_t ndim = tensor_.ndim();
- const int64_t nonzero_count = static_cast<int64_t>(CountNonZero());
+ int64_t nonzero_count = -1;
+ RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
std::shared_ptr<Buffer> indices_buffer;
RETURN_NOT_OK(
@@ -170,8 +132,7 @@ class SparseTensorConverter<TYPE, SparseCOOIndex>
std::shared_ptr<Buffer> data;
private:
- using SparseTensorConverterBase<TYPE>::tensor_;
- using SparseTensorConverterBase<TYPE>::CountNonZero;
+ using BaseClass::tensor_;
};
template <typename TYPE, typename SparseIndexType>
@@ -206,7 +167,8 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
const int64_t nr = tensor_.shape()[0];
const int64_t nc = tensor_.shape()[1];
- const int64_t nonzero_count = static_cast<int64_t>(CountNonZero());
+ int64_t nonzero_count = -1;
+ RETURN_NOT_OK(tensor_.CountNonZero(&nonzero_count));
std::shared_ptr<Buffer> indptr_buffer;
std::shared_ptr<Buffer> indices_buffer;
@@ -258,7 +220,6 @@ class SparseTensorConverter<TYPE, SparseCSRIndex>
private:
using BaseClass::tensor_;
- using SparseTensorConverterBase<TYPE>::CountNonZero;
};
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/tensor-test.cc b/cpp/src/arrow/tensor-test.cc
index 11ea7c2..caf0322 100644
--- a/cpp/src/arrow/tensor-test.cc
+++ b/cpp/src/arrow/tensor-test.cc
@@ -31,6 +31,12 @@
namespace arrow {
+void AssertCountNonZero(const Tensor& t, int64_t expected) {
+ int64_t count = -1;
+ ASSERT_OK(t.CountNonZero(&count));
+ ASSERT_EQ(count, expected);
+}
+
TEST(TestTensor, ZeroDim) {
const int64_t values = 1;
std::vector<int64_t> shape = {};
@@ -97,7 +103,7 @@ TEST(TestTensor, IsContiguous) {
ASSERT_FALSE(t3.is_contiguous());
}
-TEST(TestTensor, ZeroDimensionalTensor) {
+TEST(TestTensor, ZeroSizedTensor) {
std::vector<int64_t> shape = {0};
std::shared_ptr<Buffer> buffer;
@@ -107,6 +113,48 @@ TEST(TestTensor, ZeroDimensionalTensor) {
ASSERT_EQ(t.strides().size(), 1);
}
+TEST(TestTensor, CountNonZeroForZeroSizedTensor) {
+ std::vector<int64_t> shape = {0};
+
+ std::shared_ptr<Buffer> buffer;
+ ASSERT_OK(AllocateBuffer(0, &buffer));
+
+ Tensor t(int64(), buffer, shape);
+ AssertCountNonZero(t, 0);
+}
+
+TEST(TestTensor, CountNonZeroForContiguousTensor) {
+ std::vector<int64_t> shape = {4, 6};
+ std::vector<int64_t> values = {1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0,
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16};
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+
+ std::vector<int64_t> c_strides = {48, 8};
+ std::vector<int64_t> f_strides = {8, 32};
+ Tensor t1(int64(), buffer, shape, c_strides);
+ Tensor t2(int64(), buffer, shape, f_strides);
+
+ ASSERT_TRUE(t1.is_contiguous());
+ ASSERT_TRUE(t2.is_contiguous());
+ AssertCountNonZero(t1, 12);
+ AssertCountNonZero(t2, 12);
+}
+
+TEST(TestTensor, CountNonZeroForNonContiguousTensor) {
+ std::vector<int64_t> shape = {4, 4};
+ std::vector<int64_t> values = {
+ 1, 0, 2, 0, 0, 3, 0, 4, 5, 0, 6, 0, 7, 0, 8, 0,
+ 0, 11, 0, 12, 13, 0, 14, 0, 0, 15, 0, 16, 0, 15, 0, 16,
+ };
+ std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+
+ std::vector<int64_t> noncontig_strides = {64, 16};
+ Tensor t(int64(), buffer, shape, noncontig_strides);
+
+ ASSERT_FALSE(t.is_contiguous());
+ AssertCountNonZero(t, 8);
+}
+
TEST(TestNumericTensor, ElementAccessWithRowMajorStrides) {
std::vector<int64_t> shape = {3, 4};
diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc
index a4db298..7cd4a32 100644
--- a/cpp/src/arrow/tensor.cc
+++ b/cpp/src/arrow/tensor.cc
@@ -23,6 +23,7 @@
#include <memory>
#include <numeric>
#include <string>
+#include <type_traits>
#include <vector>
#include "arrow/compare.h"
@@ -30,6 +31,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
@@ -123,4 +125,72 @@ Type::type Tensor::type_id() const { return type_->id(); }
bool Tensor::Equals(const Tensor& other) const { return TensorEquals(*this, other); }
+namespace {
+
+template <typename TYPE>
+int64_t StridedTensorCountNonZero(int dim_index, int64_t offset, const Tensor& tensor) {
+ using c_type = typename TYPE::c_type;
+ c_type const zero = c_type(0);
+ int64_t nnz = 0;
+ if (dim_index == tensor.ndim() - 1) {
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ auto const* ptr = tensor.raw_data() + offset + i * tensor.strides()[dim_index];
+ auto& elem = *reinterpret_cast<c_type const*>(ptr);
+ if (elem != zero) ++nnz;
+ }
+ return nnz;
+ }
+ for (int64_t i = 0; i < tensor.shape()[dim_index]; ++i) {
+ nnz += StridedTensorCountNonZero<TYPE>(dim_index + 1, offset, tensor);
+ offset += tensor.strides()[dim_index];
+ }
+ return nnz;
+}
+
+template <typename TYPE>
+int64_t ContiguousTensorCountNonZero(const Tensor& tensor) {
+ using c_type = typename TYPE::c_type;
+ auto* data = reinterpret_cast<c_type const*>(tensor.raw_data());
+ return std::count_if(data, data + tensor.size(),
+ [](c_type const& x) { return x != 0; });
+}
+
+template <typename TYPE>
+inline int64_t TensorCountNonZero(const Tensor& tensor) {
+ if (tensor.is_contiguous()) {
+ return ContiguousTensorCountNonZero<TYPE>(tensor);
+ } else {
+ return StridedTensorCountNonZero<TYPE>(0, 0, tensor);
+ }
+}
+
+struct NonZeroCounter {
+ NonZeroCounter(const Tensor& tensor, int64_t* result)
+ : tensor_(tensor), result_(result) {}
+
+ template <typename TYPE>
+ typename std::enable_if<!std::is_base_of<Number, TYPE>::value, Status>::type Visit(
+ const TYPE& type) {
+ DCHECK(!is_tensor_supported(type.id()));
+ return Status::NotImplemented("Tensor of ", type.ToString(), " is not implemented");
+ }
+
+ template <typename TYPE>
+ typename std::enable_if<std::is_base_of<Number, TYPE>::value, Status>::type Visit(
+ const TYPE& type) {
+ *result_ = TensorCountNonZero<TYPE>(tensor_);
+ return Status::OK();
+ }
+
+ const Tensor& tensor_;
+ int64_t* result_;
+};
+
+} // namespace
+
+Status Tensor::CountNonZero(int64_t* result) const {
+ NonZeroCounter counter(*this, result);
+ return VisitTypeInline(*type(), &counter);
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h
index fb2093b..3171502 100644
--- a/cpp/src/arrow/tensor.h
+++ b/cpp/src/arrow/tensor.h
@@ -104,6 +104,9 @@ class ARROW_EXPORT Tensor {
bool Equals(const Tensor& other) const;
+ /// Compute the number of non-zero values in the tensor
+ Status CountNonZero(int64_t* result) const;
+
protected:
Tensor() {}