You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "kou (via GitHub)" <gi...@apache.org> on 2023/04/03 01:53:20 UTC

[GitHub] [arrow] kou commented on a diff in pull request #34836: GH-34823: [C++][ORC] Fix ORC CHAR type mapping

kou commented on code in PR #34836:
URL: https://github.com/apache/arrow/pull/34836#discussion_r1155420214


##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -453,6 +453,59 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   EXPECT_EQ(num_rows / reader_batch_size, batches);
 }
 
+TEST(TestAdapterRead, ReadCharAndVarcharType) {
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto orc_type = liborc::Type::buildTypeFromString("struct<c1:char(6),c2:varchar(6)>");
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+
+  constexpr int64_t row_count = 2;
+  auto batch = writer->createRowBatch(row_count);
+  auto struct_batch = internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+
+  // Verify that longer data will be truncated by ORC char and varchar types.
+  // In addition, ORC char type will pad the data with spaces if the data is shorter.
+  const std::vector<std::string> data = {"abcd", "ABCDEFGH"};
+  const std::vector<std::vector<std::string>> expected_data = {{"abcd  ", "ABCDEF"},
+                                                               {"abcd", "ABCDEF"}};
+
+  for (uint64_t col = 0; col < orc_type->getSubtypeCount(); col++) {

Review Comment:
   ```suggestion
     for (uint64_t col = 0; col < orc_type->getSubtypeCount(); ++col) {
   ```



##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -453,6 +453,59 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   EXPECT_EQ(num_rows / reader_batch_size, batches);
 }
 
+TEST(TestAdapterRead, ReadCharAndVarcharType) {
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto orc_type = liborc::Type::buildTypeFromString("struct<c1:char(6),c2:varchar(6)>");
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+
+  constexpr int64_t row_count = 2;
+  auto batch = writer->createRowBatch(row_count);
+  auto struct_batch = internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+
+  // Verify that longer data will be truncated by ORC char and varchar types.
+  // In addition, ORC char type will pad the data with spaces if the data is shorter.
+  const std::vector<std::string> data = {"abcd", "ABCDEFGH"};
+  const std::vector<std::vector<std::string>> expected_data = {{"abcd  ", "ABCDEF"},
+                                                               {"abcd", "ABCDEF"}};
+
+  for (uint64_t col = 0; col < orc_type->getSubtypeCount(); col++) {
+    auto str_batch =
+        internal::checked_cast<liborc::StringVectorBatch*>(struct_batch->fields[col]);
+    str_batch->hasNulls = false;
+    str_batch->numElements = row_count;
+    for (int64_t i = 0; i < row_count; ++i) {

Review Comment:
   Should we use `row` instead of `i` here like the below loop?



##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -453,6 +453,59 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   EXPECT_EQ(num_rows / reader_batch_size, batches);
 }
 
+TEST(TestAdapterRead, ReadCharAndVarcharType) {
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto orc_type = liborc::Type::buildTypeFromString("struct<c1:char(6),c2:varchar(6)>");
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+
+  constexpr int64_t row_count = 2;
+  auto batch = writer->createRowBatch(row_count);
+  auto struct_batch = internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+
+  // Verify that longer data will be truncated by ORC char and varchar types.
+  // In addition, ORC char type will pad the data with spaces if the data is shorter.
+  const std::vector<std::string> data = {"abcd", "ABCDEFGH"};
+  const std::vector<std::vector<std::string>> expected_data = {{"abcd  ", "ABCDEF"},
+                                                               {"abcd", "ABCDEF"}};
+
+  for (uint64_t col = 0; col < orc_type->getSubtypeCount(); col++) {
+    auto str_batch =
+        internal::checked_cast<liborc::StringVectorBatch*>(struct_batch->fields[col]);
+    str_batch->hasNulls = false;
+    str_batch->numElements = row_count;
+    for (int64_t i = 0; i < row_count; ++i) {
+      str_batch->data[i] = const_cast<char*>(data[i].c_str());
+      str_batch->length[i] = static_cast<int64_t>(data[i].size());
+    }
+  }
+  batch->numElements = row_count;
+  writer->add(*batch);
+  writer->close();
+
+  std::shared_ptr<io::RandomAccessFile> in_stream(std::make_shared<io::BufferReader>(
+      reinterpret_cast<const uint8_t*>(mem_stream.getData()),
+      static_cast<int64_t>(mem_stream.getLength())));
+  ASSERT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  ASSERT_EQ(row_count, reader->NumberOfRows());
+  ASSERT_EQ(1, reader->NumberOfStripes());
+
+  EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(row_count));
+  std::shared_ptr<RecordBatch> record_batch;
+  ASSERT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
+  ASSERT_NE(record_batch, nullptr);

Review Comment:
   It seems that we use `expected, actual` order in this test.
   
   ```suggestion
     ASSERT_NE(nullptr, record_batch);
   ```



##########
cpp/src/arrow/adapters/orc/adapter_test.cc:
##########
@@ -453,6 +453,59 @@ TEST(TestAdapterRead, ReadIntAndStringFileMultipleStripes) {
   EXPECT_EQ(num_rows / reader_batch_size, batches);
 }
 
+TEST(TestAdapterRead, ReadCharAndVarcharType) {
+  MemoryOutputStream mem_stream(kDefaultMemStreamSize);
+  auto orc_type = liborc::Type::buildTypeFromString("struct<c1:char(6),c2:varchar(6)>");
+  auto writer = CreateWriter(/*stripe_size=*/1024, *orc_type, &mem_stream);
+
+  constexpr int64_t row_count = 2;
+  auto batch = writer->createRowBatch(row_count);
+  auto struct_batch = internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+
+  // Verify that longer data will be truncated by ORC char and varchar types.
+  // In addition, ORC char type will pad the data with spaces if the data is shorter.
+  const std::vector<std::string> data = {"abcd", "ABCDEFGH"};
+  const std::vector<std::vector<std::string>> expected_data = {{"abcd  ", "ABCDEF"},
+                                                               {"abcd", "ABCDEF"}};
+
+  for (uint64_t col = 0; col < orc_type->getSubtypeCount(); col++) {
+    auto str_batch =
+        internal::checked_cast<liborc::StringVectorBatch*>(struct_batch->fields[col]);
+    str_batch->hasNulls = false;
+    str_batch->numElements = row_count;
+    for (int64_t i = 0; i < row_count; ++i) {
+      str_batch->data[i] = const_cast<char*>(data[i].c_str());
+      str_batch->length[i] = static_cast<int64_t>(data[i].size());
+    }
+  }
+  batch->numElements = row_count;
+  writer->add(*batch);
+  writer->close();
+
+  std::shared_ptr<io::RandomAccessFile> in_stream(std::make_shared<io::BufferReader>(
+      reinterpret_cast<const uint8_t*>(mem_stream.getData()),
+      static_cast<int64_t>(mem_stream.getLength())));
+  ASSERT_OK_AND_ASSIGN(
+      auto reader, adapters::orc::ORCFileReader::Open(in_stream, default_memory_pool()));
+  ASSERT_EQ(row_count, reader->NumberOfRows());
+  ASSERT_EQ(1, reader->NumberOfStripes());
+
+  EXPECT_OK_AND_ASSIGN(auto stripe_reader, reader->NextStripeReader(row_count));
+  std::shared_ptr<RecordBatch> record_batch;
+  ASSERT_TRUE(stripe_reader->ReadNext(&record_batch).ok());

Review Comment:
   Can we use `ASSERT_OK()` here?
   
   ```suggestion
     ASSERT_OK(stripe_reader->ReadNext(&record_batch));
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org