You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "rok (via GitHub)" <gi...@apache.org> on 2023/02/15 12:13:52 UTC

[GitHub] [arrow] rok commented on a diff in pull request #8510: GH-15483: [C++] Add a Fixed Shape Tensor canonical ExtensionType

rok commented on code in PR #8510:
URL: https://github.com/apache/arrow/pull/8510#discussion_r1107047537


##########
cpp/src/arrow/extension_type_test.cc:
##########
@@ -333,4 +337,146 @@ TEST_F(TestExtensionType, ValidateExtensionArray) {
   ASSERT_OK(ext_arr4->ValidateFull());
 }
 
+TEST_F(TestExtensionType, TensorArrayType) {
+  using TensorArrayType = extension::TensorArrayType;
+
+  std::vector<int64_t> shape = {3, 3, 4};
+  std::vector<int64_t> cell_shape = {3, 4};
+  auto value_type = int64();
+  std::shared_ptr<DataType> cell_type = fixed_size_list(value_type, 12);
+
+  std::vector<std::string> dim_names = {"x", "y"};
+  std::vector<int64_t> strides = {96, 32, 8};
+  std::vector<int64_t> column_major_strides = {8, 24, 72};
+  std::vector<int64_t> neither_major_strides = {96, 8, 32};
+  std::vector<int64_t> cell_strides = {32, 8};
+  std::vector<int64_t> values = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                                 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+                                 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35};
+  std::vector<int64_t> values_partial = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                                         12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23};
+  std::vector<int64_t> shape_partial = {2, 3, 4};
+  std::string serialized =
+      R"({"shape":[3,4],"dim_names":["x","y"],"metadata":{"key1":"metadata1"}})";
+  std::string metadata = R"({"key1":"metadata1"})";
+
+  ASSERT_OK_AND_ASSIGN(auto tensor,
+                       Tensor::Make(value_type, Buffer::Wrap(values), shape));
+  ASSERT_OK_AND_ASSIGN(
+      auto tensor_partial,
+      Tensor::Make(value_type, Buffer::Wrap(values_partial), shape_partial));
+
+  std::shared_ptr<ExtensionType> ext_type =
+      extension::tensor_array(value_type, cell_shape, {}, dim_names, metadata);
+  auto exact_ext_type = internal::checked_pointer_cast<TensorArrayType>(ext_type);
+  ASSERT_OK_AND_ASSIGN(auto ds,
+                       ext_type->Deserialize(ext_type->storage_type(), serialized));
+  std::shared_ptr<ExtensionType> deserialized =
+      std::reinterpret_pointer_cast<ExtensionType>(ds);
+
+  ASSERT_TRUE(tensor->is_row_major());
+  ASSERT_EQ(tensor->strides(), strides);
+  ASSERT_EQ(tensor_partial->strides(), strides);
+
+  // Test ExtensionType methods
+  ASSERT_EQ(ext_type->extension_name(), "arrow.fixed_shape_tensor");
+  ASSERT_TRUE(ext_type->ExtensionEquals(*exact_ext_type));
+  ASSERT_TRUE(ext_type->storage_type()->Equals(*cell_type));
+  ASSERT_EQ(ext_type->Serialize(), serialized);
+  ASSERT_TRUE(deserialized->ExtensionEquals(*ext_type));
+  ASSERT_EQ(internal::checked_pointer_cast<TensorArrayType>(deserialized)->metadata(),
+            metadata);
+  ASSERT_EQ(exact_ext_type->id(), Type::EXTENSION);
+
+  // Test TensorArrayType methods
+  ASSERT_EQ(exact_ext_type->ndim(), cell_shape.size());
+  ASSERT_EQ(exact_ext_type->shape(), cell_shape);
+  ASSERT_EQ(exact_ext_type->strides(), cell_strides);
+  ASSERT_EQ(exact_ext_type->dim_names(), dim_names);
+  ASSERT_EQ(exact_ext_type->metadata(), metadata);
+
+  // Test MakeArray(std::shared_ptr<ArrayData> data)
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, Buffer::Wrap(values)};
+  auto arr_data = std::make_shared<ArrayData>(value_type, values.size(), buffers, 0, 0);
+  auto arr = std::make_shared<Int64Array>(arr_data);
+  EXPECT_OK_AND_ASSIGN(auto fsla_arr, FixedSizeListArray::FromArrays(arr, cell_type));
+  auto data = fsla_arr->data();
+  data->type = ext_type;
+  auto ext_arr = exact_ext_type->MakeArray(data);
+  ASSERT_EQ(ext_arr->length(), shape[0]);
+  ASSERT_EQ(ext_arr->null_count(), 0);
+
+  // Test MakeArray(std::shared_ptr<Tensor> tensor)
+  EXPECT_OK_AND_ASSIGN(auto ext_arr_partial, exact_ext_type->MakeArray(tensor_partial));
+  ASSERT_OK(ext_arr->ValidateFull());
+  ASSERT_OK(ext_arr_partial->ValidateFull());
+
+  // Test ToTensor(std::shared_ptr<Array> array)
+  EXPECT_OK_AND_ASSIGN(auto t, exact_ext_type->ToTensor(ext_arr));
+  ASSERT_EQ(t->shape(), tensor->shape());
+  ASSERT_EQ(t->strides(), tensor->strides());
+  ASSERT_TRUE(tensor->Equals(*t));
+
+  // Test slicing
+  auto sliced = internal::checked_pointer_cast<ExtensionArray>(ext_arr->Slice(0, 2));
+  auto partial = internal::checked_pointer_cast<ExtensionArray>(ext_arr_partial);
+  ASSERT_OK(sliced->ValidateFull());
+  ASSERT_TRUE(sliced->storage()->Equals(*partial->storage()));
+  ASSERT_EQ(sliced->length(), partial->length());
+
+  // TODO: Where should canonical types be registered?
+  ASSERT_OK(RegisterExtensionType(exact_ext_type));

Review Comment:
   @westonpace I want to add a canonical extension type to Arrow, and for it to be usable from Python it should probably be registered "at boot". What would be a good location for this registration?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org