You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/10/20 07:35:19 UTC
[arrow] branch master updated: ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f49f8edfdc ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)
f49f8edfdc is described below
commit f49f8edfdc82ffb4aef4c088670cdf572174f3c4
Author: Miles Granger <mi...@gmail.com>
AuthorDate: Thu Oct 20 09:35:07 2022 +0200
ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)
Part of [ARROW-14596](https://issues.apache.org/jira/browse/ARROW-14596) and [ARROW-13798](https://issues.apache.org/jira/browse/ARROW-13798).
This PR does not attempt to support selecting from lists; it only supports dotted paths into arbitrarily nested structs, using the existing `FromDotPath`. Selecting from lists will require further discussion, as well as kernel support for selecting struct subsets from lists.
Authored-by: Miles Granger <mi...@gmail.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
cpp/src/arrow/dataset/scanner.cc | 14 +++++++++++++-
cpp/src/arrow/dataset/scanner_test.cc | 18 ++++++++++++++++++
cpp/src/arrow/dataset/test_util.h | 2 +-
python/pyarrow/tests/test_dataset.py | 28 ++++++++++++++++++++++++++++
4 files changed, 60 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 5b52b3fb81..b4baab5a09 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -789,7 +789,19 @@ Result<ProjectionDescr> ProjectionDescr::FromNames(std::vector<std::string> name
const Schema& dataset_schema) {
std::vector<compute::Expression> exprs(names.size());
for (size_t i = 0; i < exprs.size(); ++i) {
- exprs[i] = compute::field_ref(names[i]);
+ // If name isn't in schema, try finding it by dotted path.
+ if (dataset_schema.GetFieldByName(names[i]) == nullptr) {
+ auto name = names[i];
+ if (name.rfind(".", 0) != 0) {
+ name = "." + name;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name));
+ // safe as we know there is at least 1 dot.
+ names[i] = name.substr(name.rfind(".") + 1);
+ exprs[i] = compute::field_ref(field_ref);
+ } else {
+ exprs[i] = compute::field_ref(names[i]);
+ }
}
auto fields = dataset_schema.fields();
for (const auto& aug_field : kAugmentedFields) {
diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc
index 7d5ef09110..8c667a7514 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -1088,6 +1088,24 @@ TEST_P(TestScanner, ProjectedScanNested) {
AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
}
+TEST_P(TestScanner, ProjectedScanNestedFromNames) {
+ SetSchema({
+ field("struct", struct_({field("i32", int32()), field("f64", float64())})),
+ field("nested", struct_({field("left", int32()),
+ field("right", struct_({field("i32", int32()),
+ field("f64", float64())}))})),
+ });
+ ASSERT_OK_AND_ASSIGN(auto descr,
+ ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"},
+ *options_->dataset_schema))
+ SetProjection(options_.get(), std::move(descr));
+ auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_);
+ auto batch_out = ConstantArrayGenerator::Zeroes(
+ GetParam().items_per_batch,
+ schema({field("i32", int32()), field("f64", float64())}));
+ AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
+}
+
TEST_P(TestScanner, MaterializeMissingColumn) {
SetSchema({field("i32", int32()), field("f64", float64())});
auto batch_missing_f64 = ConstantArrayGenerator::Zeroes(
diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h
index 17065bfd7d..02464f0c38 100644
--- a/cpp/src/arrow/dataset/test_util.h
+++ b/cpp/src/arrow/dataset/test_util.h
@@ -157,7 +157,7 @@ class DatasetFixtureMixin : public ::testing::Test {
std::shared_ptr<RecordBatch> lhs;
ASSERT_OK(expected->ReadNext(&lhs));
EXPECT_NE(lhs, nullptr);
- AssertBatchesEqual(*lhs, batch);
+ AssertBatchesEqual(*lhs, batch, true);
}
/// \brief Ensure that record batches found in reader are equals to the
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 5274ddce03..37199e38c2 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4865,3 +4865,31 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir):
ds.write_dataset(
scanner, tempdir, partitioning=["original_column"], format="ipc"
)
+
+
+@pytest.mark.parametrize("format", ("ipc", "parquet"))
+def test_read_table_nested_columns(tempdir, format):
+ if format == "parquet":
+ pytest.importorskip("pyarrow.parquet")
+
+ table = pa.table({"user_id": ["abc123", "qrs456"],
+ "a.dotted.field": [1, 2],
+ "interaction": [
+ {"type": None, "element": "button",
+ "values": [1, 2], "structs":[{"foo": "bar"}, None]},
+ {"type": "scroll", "element": "window",
+ "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]}
+ ]})
+ ds.write_dataset(table, tempdir / "table", format=format)
+ ds1 = ds.dataset(tempdir / "table", format=format)
+
+ # Dot path to read subsets of nested data
+ table = ds1.to_table(
+ columns=["user_id", "interaction.type", "interaction.values",
+ "interaction.structs", "a.dotted.field"])
+ assert table.to_pylist() == [
+ {'user_id': 'abc123', 'type': None, 'values': [1, 2],
+ 'structs': [{'fizz': None, 'foo': 'bar'}, None], 'a.dotted.field': 1},
+ {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
+ 'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
+ ]