You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2022/10/20 21:43:54 UTC

[arrow] 07/13: ARROW-17524: [C++] Correction for fields included when reading an ORC table (#13962)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch maint-10.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit eea4a54f66a36137f4204876a68e6d5ed913cbc5
Author: LouisClt <lo...@hotmail.fr>
AuthorDate: Tue Oct 18 22:27:15 2022 +0200

    ARROW-17524: [C++] Correction for fields included when reading an ORC table (#13962)
    
    I think there is a bug in the ORC reader : when we specify the fields indexes that we want to keep, it does not work correctly. Looking at the code, it seems to be because we do "includeTypes" in lieue of "include" when setting the ORC options.
    It can be problematic when we want to import an ORC table containing Union types as it will do an error at the import, even if we try not to import these specific fields.
    
    The definitions of the corresponding ORC methods are here :
    https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L185-L191
    
    and
    https://github.com/apache/orc/blob/72220851cbde164a22706f8d47741fd1ad3db190/c%2B%2B/src/Options.hh#L201-L207
    
    Lead-authored-by: LouisClt <lo...@hotmail.fr>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Co-authored-by: Antoine Pitrou <pi...@free.fr>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 c_glib/test/test-orc-file-reader.rb        | 10 +++-----
 cpp/src/arrow/adapters/orc/adapter.cc      |  2 +-
 cpp/src/arrow/adapters/orc/adapter_test.cc | 40 ++++++++++++++++++++++++++++--
 ruby/red-arrow/test/test-orc.rb            |  4 +--
 4 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb
index 38900cf12f..6626c67c3a 100644
--- a/c_glib/test/test-orc-file-reader.rb
+++ b/c_glib/test/test-orc-file-reader.rb
@@ -185,8 +185,8 @@ map: map<string, struct<int1: int32, string1: string>>
     test("select fields") do
       require_gi_bindings(3, 2, 6)
       @reader.field_indices = [1, 3]
-      assert_equal(build_table("boolean1" => build_boolean_array([false, true]),
-                               "short1" => build_int16_array([1024, 2048])),
+      assert_equal(build_table("byte1" => build_int8_array([1, 100]),
+                               "int1" => build_int32_array([65536, 65536])),
                    @reader.read_stripes)
     end
   end
@@ -200,10 +200,8 @@ map: map<string, struct<int1: int32, string1: string>>
     test("select fields") do
       require_gi_bindings(3, 2, 6)
       @reader.field_indices = [1, 3]
-      boolean1 = build_boolean_array([false, true])
-      short1 = build_int16_array([1024, 2048])
-      assert_equal(build_record_batch("boolean1" => boolean1,
-                                      "short1" => short1),
+      assert_equal(build_record_batch("byte1" => build_int8_array([1, 100]),
+                                      "int1" => build_int32_array([65536, 65536])),
                    @reader.read_stripe(0))
     end
   end
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 5af5ebccc8..18f88bc6df 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -411,7 +411,7 @@ class ORCFileReader::Impl {
       ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
       include_indices_list.push_back(*it);
     }
-    opts->includeTypes(include_indices_list);
+    opts->include(include_indices_list);
     return Status::OK();
   }
 
diff --git a/cpp/src/arrow/adapters/orc/adapter_test.cc b/cpp/src/arrow/adapters/orc/adapter_test.cc
index 5c234cc97c..afc4bdb1d3 100644
--- a/cpp/src/arrow/adapters/orc/adapter_test.cc
+++ b/cpp/src/arrow/adapters/orc/adapter_test.cc
@@ -226,7 +226,8 @@ std::shared_ptr<Table> GenerateRandomTable(const std::shared_ptr<Schema>& schema
 
 void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
                                const std::shared_ptr<Table>& expected_output_table,
-                               const int64_t max_size = kDefaultSmallMemStreamSize) {
+                               const int64_t max_size = kDefaultSmallMemStreamSize,
+                               std::vector<int>* opt_selected_read_indices = nullptr) {
   EXPECT_OK_AND_ASSIGN(auto buffer_output_stream,
                        io::BufferOutputStream::Create(max_size));
   auto write_options = adapters::orc::WriteOptions();
@@ -250,7 +251,11 @@ void AssertTableWriteReadEqual(const std::shared_ptr<Table>& input_table,
   ASSERT_EQ(reader->GetCompression(), write_options.compression);
   ASSERT_EQ(reader->GetCompressionSize(), write_options.compression_block_size);
   ASSERT_EQ(reader->GetRowIndexStride(), write_options.row_index_stride);
-  EXPECT_OK_AND_ASSIGN(auto actual_output_table, reader->Read());
+  EXPECT_OK_AND_ASSIGN(auto actual_output_table,
+                       opt_selected_read_indices == nullptr
+                           ? reader->Read()
+                           : reader->Read(*opt_selected_read_indices));
+  ASSERT_OK(actual_output_table->ValidateFull());
   AssertTablesEqual(*expected_output_table, *actual_output_table, false, false);
 }
 
@@ -451,6 +456,37 @@ TEST_F(TestORCWriterTrivialNoConversion, writeChunkless) {
   std::shared_ptr<Table> table = TableFromJSON(table_schema, {});
   AssertTableWriteReadEqual(table, table, kDefaultSmallMemStreamSize / 16);
 }
+TEST_F(TestORCWriterTrivialNoConversion, writeTrivialChunkAndSelectField) {
+  std::shared_ptr<Table> table = TableFromJSON(table_schema, {R"([])"});
+  std::shared_ptr<Schema> schema_selected =
+      schema({field("int8", int8()), field("int32", int32())});
+  std::shared_ptr<Table> table_selected = TableFromJSON(schema_selected, {R"([])"});
+  std::vector<int> selected_indices = {1, 3};
+  AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize / 16,
+                            &selected_indices);
+}
+TEST_F(TestORCWriterTrivialNoConversion, writeFilledChunkAndSelectField) {
+  std::vector<int> selected_indices = {1, 7};
+  random::RandomArrayGenerator rand(kRandomSeed);
+  std::shared_ptr<Schema> local_schema = schema({
+      field("bool", boolean()),
+      field("int32", int32()),
+      field("int64", int64()),
+      field("float", float32()),
+      field("struct", struct_({field("a", utf8()), field("b", int64())})),
+      field("double", float64()),
+      field("date32", date32()),
+      field("ts3", timestamp(TimeUnit::NANO)),
+      field("string", utf8()),
+      field("binary", binary()),
+  });
+  auto batch = rand.BatchOf(local_schema->fields(), 100);
+  std::shared_ptr<Table> table = Table::Make(local_schema, batch->columns());
+  EXPECT_OK_AND_ASSIGN(auto table_selected, table->SelectColumns(selected_indices));
+  AssertTableWriteReadEqual(table, table_selected, kDefaultSmallMemStreamSize,
+                            &selected_indices);
+}
+
 class TestORCWriterTrivialWithConversion : public ::testing::Test {
  public:
   TestORCWriterTrivialWithConversion() {
diff --git a/ruby/red-arrow/test/test-orc.rb b/ruby/red-arrow/test/test-orc.rb
index b882da0a1b..4670350a09 100644
--- a/ruby/red-arrow/test/test-orc.rb
+++ b/ruby/red-arrow/test/test-orc.rb
@@ -164,8 +164,8 @@ class ORCTest < Test::Unit::TestCase
         ]
       end
       assert_equal([
-                     ["boolean1: bool", [pp_values([false, true])]],
-                     ["short1: int16", [pp_values([1024, 2048])]],
+                     ["byte1: int8", [pp_values([1, 100])]],
+                     ["int1: int32", [pp_values([65536, 65536])]],
                    ],
                    dump)
     end