You are viewing a plain text version of this content. The canonical link for it is available in the Apache mailing-list archives (hyperlink omitted in this plain-text rendering).
Posted to commits@arrow.apache.org by li...@apache.org on 2022/08/02 12:51:11 UTC
[arrow] branch master updated: ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)
This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2a027571d7 ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)
2a027571d7 is described below
commit 2a027571d7a10526939c4c37bf84b2368e6f4b74
Author: 0x26res <ar...@gmail.com>
AuthorDate: Tue Aug 2 13:51:05 2022 +0100
ARROW-17228: [Python] dataset.write_data should use Scanner.projected_schema when passed a scanner with projected columns (#13756)
Issue: https://issues.apache.org/jira/browse/ARROW-17228
Authored-by: 0x26res <0x...@gmail.com>
Signed-off-by: David Li <li...@gmail.com>
---
python/pyarrow/dataset.py | 2 +-
python/pyarrow/tests/test_dataset.py | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 2518e37ec6..326b37ec6e 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -964,7 +964,7 @@ Table/RecordBatch, or iterable of RecordBatch
# was converted to one of those two. So we can grab the schema
# to build the partitioning object from Dataset.
if isinstance(data, Scanner):
- partitioning_schema = data.dataset_schema
+ partitioning_schema = data.projected_schema
else:
partitioning_schema = data.schema
partitioning = _ensure_write_partitioning(partitioning,
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index b900e694a9..3dc9c3beb6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -4706,3 +4706,28 @@ def test_dataset_filter(tempdir):
"colA": [1, 2],
"col2": ["a", "b"]
})
+
+
+def test_write_dataset_with_scanner_use_projected_schema(tempdir):
+ """
+ Ensure the projected schema is used to validate partitions for scanner
+
+ https://issues.apache.org/jira/browse/ARROW-17228
+ """
+ table = pa.table([pa.array(range(20))], names=["original_column"])
+ table_dataset = ds.dataset(table)
+ columns = {
+ "renamed_column": ds.field("original_column"),
+ }
+ scanner = table_dataset.scanner(columns=columns)
+
+ ds.write_dataset(
+ scanner, tempdir, partitioning=["renamed_column"], format="ipc")
+ with (
+ pytest.raises(
+ KeyError, match=r"'Column original_column does not exist in schema"
+ )
+ ):
+ ds.write_dataset(
+ scanner, tempdir, partitioning=["original_column"], format="ipc"
+ )