You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2022/10/20 07:35:19 UTC

[arrow] branch master updated: ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f49f8edfdc ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)
f49f8edfdc is described below

commit f49f8edfdc82ffb4aef4c088670cdf572174f3c4
Author: Miles Granger <mi...@gmail.com>
AuthorDate: Thu Oct 20 09:35:07 2022 +0200

    ARROW-14596: [C++][Python] Read table nested struct fields in columns (#14326)
    
    Part of [ARROW-14596](https://issues.apache.org/jira/browse/ARROW-14596) and [ARROW-13798](https://issues.apache.org/jira/browse/ARROW-13798).
    
    This PR does not attempt to support selecting from lists; it only supports dotted paths into arbitrarily nested structs, using the existing `FromDotPath`. Selecting from lists will require further discussion, as well as kernel support for selecting struct subsets from lists.
    
    
    Authored-by: Miles Granger <mi...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 cpp/src/arrow/dataset/scanner.cc      | 14 +++++++++++++-
 cpp/src/arrow/dataset/scanner_test.cc | 18 ++++++++++++++++++
 cpp/src/arrow/dataset/test_util.h     |  2 +-
 python/pyarrow/tests/test_dataset.py  | 28 ++++++++++++++++++++++++++++
 4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc
index 5b52b3fb81..b4baab5a09 100644
--- a/cpp/src/arrow/dataset/scanner.cc
+++ b/cpp/src/arrow/dataset/scanner.cc
@@ -789,7 +789,19 @@ Result<ProjectionDescr> ProjectionDescr::FromNames(std::vector<std::string> name
                                                    const Schema& dataset_schema) {
   std::vector<compute::Expression> exprs(names.size());
   for (size_t i = 0; i < exprs.size(); ++i) {
-    exprs[i] = compute::field_ref(names[i]);
+    // If name isn't in schema, try finding it by dotted path.
+    if (dataset_schema.GetFieldByName(names[i]) == nullptr) {
+      auto name = names[i];
+      if (name.rfind(".", 0) != 0) {
+        name = "." + name;
+      }
+      ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name));
+      // safe as we know there is at least 1 dot.
+      names[i] = name.substr(name.rfind(".") + 1);
+      exprs[i] = compute::field_ref(field_ref);
+    } else {
+      exprs[i] = compute::field_ref(names[i]);
+    }
   }
   auto fields = dataset_schema.fields();
   for (const auto& aug_field : kAugmentedFields) {
diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc
index 7d5ef09110..8c667a7514 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -1088,6 +1088,24 @@ TEST_P(TestScanner, ProjectedScanNested) {
   AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
 }
 
+TEST_P(TestScanner, ProjectedScanNestedFromNames) {
+  SetSchema({
+      field("struct", struct_({field("i32", int32()), field("f64", float64())})),
+      field("nested", struct_({field("left", int32()),
+                               field("right", struct_({field("i32", int32()),
+                                                       field("f64", float64())}))})),
+  });
+  ASSERT_OK_AND_ASSIGN(auto descr,
+                       ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"},
+                                                  *options_->dataset_schema))
+  SetProjection(options_.get(), std::move(descr));
+  auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_);
+  auto batch_out = ConstantArrayGenerator::Zeroes(
+      GetParam().items_per_batch,
+      schema({field("i32", int32()), field("f64", float64())}));
+  AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out);
+}
+
 TEST_P(TestScanner, MaterializeMissingColumn) {
   SetSchema({field("i32", int32()), field("f64", float64())});
   auto batch_missing_f64 = ConstantArrayGenerator::Zeroes(
diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h
index 17065bfd7d..02464f0c38 100644
--- a/cpp/src/arrow/dataset/test_util.h
+++ b/cpp/src/arrow/dataset/test_util.h
@@ -157,7 +157,7 @@ class DatasetFixtureMixin : public ::testing::Test {
     std::shared_ptr<RecordBatch> lhs;
     ASSERT_OK(expected->ReadNext(&lhs));
     EXPECT_NE(lhs, nullptr);
-    AssertBatchesEqual(*lhs, batch);
+    AssertBatchesEqual(*lhs, batch, true);
   }
 
   /// \brief Ensure that record batches found in reader are equals to the
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 5274ddce03..37199e38c2 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4865,3 +4865,31 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir):
         ds.write_dataset(
             scanner, tempdir, partitioning=["original_column"], format="ipc"
         )
+
+
+@pytest.mark.parametrize("format", ("ipc", "parquet"))
+def test_read_table_nested_columns(tempdir, format):
+    if format == "parquet":
+        pytest.importorskip("pyarrow.parquet")
+
+    table = pa.table({"user_id": ["abc123", "qrs456"],
+                      "a.dotted.field": [1, 2],
+                      "interaction": [
+        {"type": None, "element": "button",
+         "values": [1, 2], "structs":[{"foo": "bar"}, None]},
+        {"type": "scroll", "element": "window",
+         "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]}
+    ]})
+    ds.write_dataset(table, tempdir / "table", format=format)
+    ds1 = ds.dataset(tempdir / "table", format=format)
+
+    # Dot path to read subsets of nested data
+    table = ds1.to_table(
+        columns=["user_id", "interaction.type", "interaction.values",
+                 "interaction.structs", "a.dotted.field"])
+    assert table.to_pylist() == [
+        {'user_id': 'abc123', 'type': None, 'values': [1, 2],
+         'structs': [{'fizz': None, 'foo': 'bar'}, None], 'a.dotted.field': 1},
+        {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
+         'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
+    ]