You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2021/05/17 18:40:59 UTC
[arrow] branch maint-4.0.x updated: ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch maint-4.0.x
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/maint-4.0.x by this push:
new d1ca32e ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)
d1ca32e is described below
commit d1ca32e3bf853c4d9aca9819a6d30bc283bfac5b
Author: David Li <li...@gmail.com>
AuthorDate: Mon May 17 14:39:35 2021 -0400
ARROW-12603: [C++][Dataset] Backport fix for specifying CSV column types (#10344)
This backports the relevant part of ARROW-12500 into the 4.0.1 branch.
Although ARROW-12500 cherry-picks cleanly, the result doesn't build because it depends on prior changes. This backport therefore includes only the actual fix, not the larger refactoring that was the focus of the original patch.
---
cpp/src/arrow/dataset/file_csv.cc | 12 +++++++++---
cpp/src/arrow/dataset/file_csv_test.cc | 22 ++++++++++++++++++++++
2 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index a8274a5..4612a12 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -90,10 +90,16 @@ static inline Result<csv::ConvertOptions> GetConvertOptions(
GetFragmentScanOptions<CsvFragmentScanOptions>(
kCsvTypeName, scan_options.get(), format.default_fragment_scan_options));
auto convert_options = csv_scan_options->convert_options;
- for (FieldRef ref : scan_options->MaterializedFields()) {
- ARROW_ASSIGN_OR_RAISE(auto field, ref.GetOne(*scan_options->dataset_schema));
-
+ auto materialized = scan_options->MaterializedFields();
+ std::unordered_set<std::string> materialized_fields(materialized.begin(),
+ materialized.end());
+ for (auto field : scan_options->dataset_schema->fields()) {
+ if (materialized_fields.find(field->name()) == materialized_fields.end()) continue;
+ // Ignore virtual columns.
if (column_names.find(field->name()) == column_names.end()) continue;
+ // Only read the requested columns
+ convert_options.include_columns.push_back(field->name());
+ // Properly set conversion types
convert_options.column_types[field->name()] = field->type();
}
return convert_options;
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 0ae6fa5..af41654 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -112,6 +112,28 @@ N/A
ASSERT_EQ(row_count, 3);
}
+TEST_P(TestCsvFileFormat, ScanRecordBatchReaderSchema) {
+ // Regression test for ARROW-12603
+ auto source = GetFileSource(R"(f64,str
+1.0,2.0
+N/A,3.0
+2,foo)");
+ SetSchema({field("f64", float64()), field("str", utf8())});
+ ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
+ auto fragment_scan_options = std::make_shared<CsvFragmentScanOptions>();
+ // Force a small buffer size so type inference gets it wrong
+ fragment_scan_options->read_options.block_size = 20;
+ opts_->fragment_scan_options = fragment_scan_options;
+ ASSERT_OK(SetProjection(opts_.get(), {"f64"}));
+
+ int64_t row_count = 0;
+ for (auto maybe_batch : Batches(fragment.get())) {
+ ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
+ row_count += batch->num_rows();
+ }
+ ASSERT_EQ(row_count, 3);
+}
+
TEST_P(TestCsvFileFormat, CustomConvertOptions) {
auto source = GetFileSource(R"(str
foo