You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2021/05/17 18:40:59 UTC
[arrow] branch maint-4.0.x updated: ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-4.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/maint-4.0.x by this push:
     new d1ca32e  ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)
d1ca32e is described below

commit d1ca32e3bf853c4d9aca9819a6d30bc283bfac5b
Author: David Li <li...@gmail.com>
AuthorDate: Mon May 17 14:39:35 2021 -0400

    ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)
    
    This backports the relevant part of ARROW-12500 into the 4.0.1 branch.
    
    Although ARROW-12500 cherry-picks cleanly, the result does not build because it depends on earlier changes. This commit therefore includes only the actual fix, not the larger refactoring that was the main focus of that patch.
---
 cpp/src/arrow/dataset/file_csv.cc      | 12 +++++++++---
 cpp/src/arrow/dataset/file_csv_test.cc | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index a8274a5..4612a12 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -90,10 +90,16 @@ static inline Result<csv::ConvertOptions> GetConvertOptions(
       GetFragmentScanOptions<CsvFragmentScanOptions>(
           kCsvTypeName, scan_options.get(), format.default_fragment_scan_options));
   auto convert_options = csv_scan_options->convert_options;
-  for (FieldRef ref : scan_options->MaterializedFields()) {
-    ARROW_ASSIGN_OR_RAISE(auto field, ref.GetOne(*scan_options->dataset_schema));
-
+  auto materialized = scan_options->MaterializedFields();
+  std::unordered_set<std::string> materialized_fields(materialized.begin(),
+                                                      materialized.end());
+  for (auto field : scan_options->dataset_schema->fields()) {
+    if (materialized_fields.find(field->name()) == materialized_fields.end()) continue;
+    // Ignore virtual columns.
     if (column_names.find(field->name()) == column_names.end()) continue;
+    // Only read the requested columns
+    convert_options.include_columns.push_back(field->name());
+    // Properly set conversion types
     convert_options.column_types[field->name()] = field->type();
   }
   return convert_options;
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 0ae6fa5..af41654 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -112,6 +112,28 @@ N/A
   ASSERT_EQ(row_count, 3);
 }
 
+TEST_P(TestCsvFileFormat, ScanRecordBatchReaderSchema) {
+  // Regression test for ARROW-12603
+  auto source = GetFileSource(R"(f64,str
+1.0,2.0
+N/A,3.0
+2,foo)");
+  SetSchema({field("f64", float64()), field("str", utf8())});
+  ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
+  auto fragment_scan_options = std::make_shared<CsvFragmentScanOptions>();
+  // Force a small buffer size so type inference gets it wrong
+  fragment_scan_options->read_options.block_size = 20;
+  opts_->fragment_scan_options = fragment_scan_options;
+  ASSERT_OK(SetProjection(opts_.get(), {"f64"}));
+
+  int64_t row_count = 0;
+  for (auto maybe_batch : Batches(fragment.get())) {
+    ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
+    row_count += batch->num_rows();
+  }
+  ASSERT_EQ(row_count, 3);
+}
+
 TEST_P(TestCsvFileFormat, CustomConvertOptions) {
   auto source = GetFileSource(R"(str
 foo