You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/05/09 16:11:23 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #35499: GH-35304: [C++][ORC] Support attributes conversion

pitrou commented on code in PR #35499:
URL: https://github.com/apache/arrow/pull/35499#discussion_r1188828047


##########
cpp/src/arrow/adapters/orc/util.cc:
##########
@@ -1176,11 +1190,32 @@ Result<std::unique_ptr<liborc::Type>> GetOrcType(const Schema& schema) {
   for (int i = 0; i < numFields; i++) {
     const auto& field = schema.field(i);
     ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*field->type()));
+    SetAttributes(field, orc_subtype.get());
     out_type->addStructField(field->name(), std::move(orc_subtype));
   }
   return std::move(out_type);
 }
 
+Result<std::shared_ptr<const KeyValueMetadata>> GetFieldMetadata(
+    const liborc::Type* type) {
+  if (type == nullptr) {
+    return nullptr;
+  }
+  const auto keys = type->getAttributeKeys();

Review Comment:
   You might actually want to avoid the metadata allocation here.
   ```suggestion
     const auto keys = type->getAttributeKeys();
     if (keys.empty()) {
       return nullptr;
     }
   ```



##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -506,6 +506,116 @@ TEST(TestAdapterRead, ReadCharAndVarcharType) {
   ASSERT_EQ(nullptr, record_batch);
 }
 
+TEST(TestAdapterRead, ReadFieldAttributes) {
+  const std::string id_key = "iceberg.id";
+  const std::string required_key = "iceberg.required";
+
+  auto set_attributes = [&](liborc::Type* type, const std::string& id,
+                            const std::string& required) {
+    type->setAttribute(id_key, id);
+    type->setAttribute(required_key, required);
+  };
+
+  auto check_attributes = [&](const std::shared_ptr<arrow::Field>& field,
+                              const std::string& expect_id,
+                              const std::string& expect_required) {
+    auto field_metadata = field->metadata();
+    ASSERT_NE(field_metadata, nullptr);
+    ASSERT_EQ(expect_id, field_metadata->Get(id_key));
+    ASSERT_EQ(expect_required, field_metadata->Get(required_key));
+  };
+
+  auto c1_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c1_type.get(), "1", "true");
+
+  auto c2_elem_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c2_elem_type.get(), "3", "false");
+  auto c2_type = liborc::createListType(std::move(c2_elem_type));
+  set_attributes(c2_type.get(), "2", "false");
+
+  auto c3_key_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c3_key_type.get(), "5", "true");
+  auto c3_value_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c3_value_type.get(), "6", "false");
+  auto c3_type = liborc::createMapType(std::move(c3_key_type), std::move(c3_value_type));
+  set_attributes(c3_type.get(), "4", "false");
+
+  auto c4_sub_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c4_sub_type.get(), "8", "false");
+  auto c4_type = liborc::createStructType();
+  c4_type->addStructField("c4_1", std::move(c4_sub_type));
+  set_attributes(c4_type.get(), "7", "false");
+
+  auto orc_type = liborc::createStructType();
+  orc_type->addStructField("c1", std::move(c1_type));
+  orc_type->addStructField("c2", std::move(c2_type));
+  orc_type->addStructField("c3", std::move(c3_type));
+  orc_type->addStructField("c4", std::move(c4_type));
+
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+  writer->close();
+
+  std::shared_ptr<io::RandomAccessFile> in_stream(std::make_shared<io::BufferReader>(
+      reinterpret_cast<const uint8_t*>(mem_stream.getData()),
+      static_cast<int64_t>(mem_stream.getLength())));
+  ASSERT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  ASSERT_EQ(0, reader->NumberOfRows());
+
+  ASSERT_OK_AND_ASSIGN(auto schema, reader->ReadSchema());
+  ASSERT_EQ(4, schema->num_fields());
+
+  // check top level fields
+  check_attributes(schema->field(0), "1", "true");
+  check_attributes(schema->field(1), "2", "false");
+  check_attributes(schema->field(2), "4", "false");
+  check_attributes(schema->field(3), "7", "false");
+
+  // check list element type
+  auto list_type = checked_pointer_cast<arrow::ListType>(schema->field(1)->type());
+  check_attributes(list_type->value_field(), "3", "false");
+
+  // check map key/value types
+  auto map_type = checked_pointer_cast<arrow::MapType>(schema->field(2)->type());
+  check_attributes(map_type->key_field(), "5", "true");
+  check_attributes(map_type->item_field(), "6", "false");
+
+  // check struct sub-field type
+  auto struct_type = checked_pointer_cast<arrow::StructType>(schema->field(3)->type());
+  check_attributes(struct_type->field(0), "8", "false");
+}
+
+TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) {
+  EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(1024));
+  auto write_options = adapters::orc::WriteOptions();
+  write_options.compression = Compression::UNCOMPRESSED;
+  EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open(
+                                        buffer_output_stream.get(), write_options));
+
+  auto schema = ::arrow::schema(
+      {::arrow::field("c0", ::arrow::int64(), true, key_value_metadata({"k0"}, {"v0"})),
+       ::arrow::field("c1", ::arrow::utf8(), true, key_value_metadata({"k1"}, {"v1"})),
+       ::arrow::field(
+           "c2", ::arrow::list(::arrow::field("item", ::arrow::int64(), true,
+                                              key_value_metadata({"k0"}, {"ddv0"}))))});

Review Comment:
   Could you also add the other kinds of nested fields? (struct, map, union)



##########
cpp/src/arrow/adapters/orc/util.h:
##########
@@ -23,6 +23,7 @@
 #include "arrow/array/builder_base.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
+#include "arrow/util/key_value_metadata.h"

Review Comment:
   Only the forward declarations should be necessary here.
   ```suggestion
   #include "arrow/type_fwd.h"
   ```



##########
cpp/src/arrow/adapters/orc/util.cc:
##########
@@ -1176,11 +1190,32 @@ Result<std::unique_ptr<liborc::Type>> GetOrcType(const Schema& schema) {
   for (int i = 0; i < numFields; i++) {
     const auto& field = schema.field(i);
     ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*field->type()));
+    SetAttributes(field, orc_subtype.get());
     out_type->addStructField(field->name(), std::move(orc_subtype));
   }
   return std::move(out_type);
 }
 
+Result<std::shared_ptr<const KeyValueMetadata>> GetFieldMetadata(
+    const liborc::Type* type) {
+  if (type == nullptr) {

Review Comment:
   Is there any valid use case where this happens?



##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -506,6 +506,116 @@ TEST(TestAdapterRead, ReadCharAndVarcharType) {
   ASSERT_EQ(nullptr, record_batch);
 }
 
+TEST(TestAdapterRead, ReadFieldAttributes) {
+  const std::string id_key = "iceberg.id";
+  const std::string required_key = "iceberg.required";
+
+  auto set_attributes = [&](liborc::Type* type, const std::string& id,
+                            const std::string& required) {
+    type->setAttribute(id_key, id);
+    type->setAttribute(required_key, required);
+  };
+
+  auto check_attributes = [&](const std::shared_ptr<arrow::Field>& field,
+                              const std::string& expect_id,
+                              const std::string& expect_required) {
+    auto field_metadata = field->metadata();
+    ASSERT_NE(field_metadata, nullptr);
+    ASSERT_EQ(expect_id, field_metadata->Get(id_key));
+    ASSERT_EQ(expect_required, field_metadata->Get(required_key));
+  };
+
+  auto c1_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c1_type.get(), "1", "true");
+
+  auto c2_elem_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c2_elem_type.get(), "3", "false");
+  auto c2_type = liborc::createListType(std::move(c2_elem_type));
+  set_attributes(c2_type.get(), "2", "false");
+
+  auto c3_key_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c3_key_type.get(), "5", "true");
+  auto c3_value_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c3_value_type.get(), "6", "false");
+  auto c3_type = liborc::createMapType(std::move(c3_key_type), std::move(c3_value_type));
+  set_attributes(c3_type.get(), "4", "false");
+
+  auto c4_sub_type = liborc::createPrimitiveType(liborc::TypeKind::INT);
+  set_attributes(c4_sub_type.get(), "8", "false");
+  auto c4_type = liborc::createStructType();
+  c4_type->addStructField("c4_1", std::move(c4_sub_type));
+  set_attributes(c4_type.get(), "7", "false");
+
+  auto orc_type = liborc::createStructType();
+  orc_type->addStructField("c1", std::move(c1_type));
+  orc_type->addStructField("c2", std::move(c2_type));
+  orc_type->addStructField("c3", std::move(c3_type));
+  orc_type->addStructField("c4", std::move(c4_type));
+
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+  writer->close();
+
+  std::shared_ptr<io::RandomAccessFile> in_stream(std::make_shared<io::BufferReader>(
+      reinterpret_cast<const uint8_t*>(mem_stream.getData()),
+      static_cast<int64_t>(mem_stream.getLength())));
+  ASSERT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  ASSERT_EQ(0, reader->NumberOfRows());
+
+  ASSERT_OK_AND_ASSIGN(auto schema, reader->ReadSchema());
+  ASSERT_EQ(4, schema->num_fields());
+
+  // check top level fields
+  check_attributes(schema->field(0), "1", "true");
+  check_attributes(schema->field(1), "2", "false");
+  check_attributes(schema->field(2), "4", "false");
+  check_attributes(schema->field(3), "7", "false");
+
+  // check list element type
+  auto list_type = checked_pointer_cast<arrow::ListType>(schema->field(1)->type());
+  check_attributes(list_type->value_field(), "3", "false");
+
+  // check map key/value types
+  auto map_type = checked_pointer_cast<arrow::MapType>(schema->field(2)->type());
+  check_attributes(map_type->key_field(), "5", "true");
+  check_attributes(map_type->item_field(), "6", "false");
+
+  // check struct sub-field type
+  auto struct_type = checked_pointer_cast<arrow::StructType>(schema->field(3)->type());
+  check_attributes(struct_type->field(0), "8", "false");
+}
+
+TEST(TestAdapterReadWrite, FieldAttributesRoundTrip) {
+  EXPECT_OK_AND_ASSIGN(auto buffer_output_stream, io::BufferOutputStream::Create(1024));
+  auto write_options = adapters::orc::WriteOptions();
+  write_options.compression = Compression::UNCOMPRESSED;
+  EXPECT_OK_AND_ASSIGN(auto writer, adapters::orc::ORCFileWriter::Open(
+                                        buffer_output_stream.get(), write_options));
+
+  auto schema = ::arrow::schema(
+      {::arrow::field("c0", ::arrow::int64(), true, key_value_metadata({"k0"}, {"v0"})),
+       ::arrow::field("c1", ::arrow::utf8(), true, key_value_metadata({"k1"}, {"v1"})),
+       ::arrow::field(
+           "c2", ::arrow::list(::arrow::field("item", ::arrow::int64(), true,
+                                              key_value_metadata({"k0"}, {"ddv0"}))))});
+  auto expected_output_table = ::arrow::TableFromJSON(schema, {R"([[1, "a", [1, 2]]])"});
+  ARROW_EXPECT_OK(writer->Write(*expected_output_table));
+  ARROW_EXPECT_OK(writer->Close());
+
+  EXPECT_OK_AND_ASSIGN(auto buffer, buffer_output_stream->Finish());
+  std::shared_ptr<io::RandomAccessFile> in_stream(new io::BufferReader(buffer));
+  EXPECT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
+  ASSERT_OK(actual_output_table->ValidateFull());
+  AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);

Review Comment:
   Can you qualify the two boolean arguments?
   ```suggestion
     AssertTablesEqual(*expected_output_table, *actual_output_table, /*xxx=*/ false, /*yyy=*/ false);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org