You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@arrow.apache.org by li...@apache.org on 2022/08/02 12:51:11 UTC

[arrow] branch master updated: ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)

This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2a027571d7 ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)
2a027571d7 is described below

commit 2a027571d7a10526939c4c37bf84b2368e6f4b74
Author: 0x26res <ar...@gmail.com>
AuthorDate: Tue Aug 2 13:51:05 2022 +0100

    ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)
    
    Issue: https://issues.apache.org/jira/browse/ARROW-17228
    
    
    
    Authored-by: 0x26res <0x...@gmail.com>
    Signed-off-by: David Li <li...@gmail.com>
---
 python/pyarrow/dataset.py            |  2 +-
 python/pyarrow/tests/test_dataset.py | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 2518e37ec6..326b37ec6e 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -964,7 +964,7 @@ Table/RecordBatch, or iterable of RecordBatch
     # was converted to one of those two. So we can grab the schema
     # to build the partitioning object from Dataset.
     if isinstance(data, Scanner):
-        partitioning_schema = data.dataset_schema
+        partitioning_schema = data.projected_schema
     else:
         partitioning_schema = data.schema
     partitioning = _ensure_write_partitioning(partitioning,
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b900e694a9..3dc9c3beb6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4706,3 +4706,28 @@ def test_dataset_filter(tempdir):
         "colA": [1, 2],
         "col2": ["a", "b"]
     })
+
+
+def test_write_dataset_with_scanner_use_projected_schema(tempdir):
+    """
+    Ensure the projected schema is used to validate partitions for scanner
+
+    https://issues.apache.org/jira/browse/ARROW-17228
+    """
+    table = pa.table([pa.array(range(20))], names=["original_column"])
+    table_dataset = ds.dataset(table)
+    columns = {
+        "renamed_column": ds.field("original_column"),
+    }
+    scanner = table_dataset.scanner(columns=columns)
+
+    ds.write_dataset(
+        scanner, tempdir, partitioning=["renamed_column"], format="ipc")
+    with (
+        pytest.raises(
+            KeyError, match=r"'Column original_column does not exist in schema"
+        )
+    ):
+        ds.write_dataset(
+            scanner, tempdir, partitioning=["original_column"], format="ipc"
+        )