Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/17 15:54:29 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #14646: ARROW-18269: [C++] Slash character in partition value handling

pitrou commented on code in PR #14646:
URL: https://github.com/apache/arrow/pull/14646#discussion_r1025376222


##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -4912,3 +4914,40 @@ def test_read_table_nested_columns(tempdir, format):
         {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
          'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
     ]
+
+
+def test_dataset_partition_with_slash(tmpdir):
+    from pyarrow import dataset as ds
+
+    path = tmpdir / "slash-writer-x"
+
+    dt_table = pa.Table.from_arrays([
+        pa.array([1, 2, 3, 4, 5], pa.int32()),
+        pa.array(["experiment/A/f.csv", "experiment/B/f.csv",
+                  "experiment/A/f.csv", "experiment/C/k.csv",
+                  "experiment/M/i.csv"], pa.utf8())], ["exp_id", "exp_meta"])
+
+    ds.write_dataset(
+        data=dt_table,
+        base_dir=path,
+        format='parquet',
+        partitioning=['exp_meta'],
+        partitioning_flavor='hive',
+    )
+
+    read_table = ds.dataset(
+        source=path,
+        format='parquet',
+        partitioning='hive',
+        schema=pa.schema([pa.field("exp_id", pa.int32()),
+                         pa.field("exp_meta", pa.utf8())])
+    ).to_table().combine_chunks()
+
+    assert dt_table == read_table.sort_by("exp_id")
+
+    exp_meta = dt_table.column(1).to_pylist()
+    exp_meta = sorted(list(dict.fromkeys(exp_meta)))  # take unique
+    encoded_paths = ["exp_meta="+quote(path, safe='') for path in exp_meta]

Review Comment:
   Style:
   ```suggestion
       encoded_paths = ["exp_meta=" + quote(path, safe='') for path in exp_meta]
   ```



##########
cpp/src/arrow/dataset/partition.cc:
##########
@@ -780,7 +780,7 @@ Result<PartitionPathFormat> HivePartitioning::FormatValues(
       // field_index <-> path nesting relation
       segments[i] = name + "=" + hive_options_.null_fallback;
     } else {
-      segments[i] = name + "=" + values[i]->ToString();
+      segments[i] = name + "=" + arrow::internal::UriEscape(values[i]->ToString());

Review Comment:
   Can you add tests on the C++ side to make sure it is covered?
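   For reference, the escaping introduced here is the same percent-encoding
   that the Python test above checks via `quote(path, safe='')`: a slash in
   a partition value must become `%2F` so it cannot create extra directory
   levels. A minimal sketch of the behavior a test would pin down
   (illustrative values, not taken from the PR):
   ```python
   from urllib.parse import quote

   # A partition value containing slashes, as in the test data above.
   value = "experiment/A/f.csv"

   # safe='' percent-encodes reserved characters, including '/',
   # so the value maps to a single path segment.
   assert quote(value, safe='') == "experiment%2FA%2Ff.csv"

   # The resulting hive-style directory name:
   assert "exp_meta=" + quote(value, safe='') == "exp_meta=experiment%2FA%2Ff.csv"
   ```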



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -4912,3 +4914,40 @@ def test_read_table_nested_columns(tempdir, format):
         {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
          'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
     ]
+
+
+def test_dataset_partition_with_slash(tmpdir):
+    from pyarrow import dataset as ds
+
+    path = tmpdir / "slash-writer-x"
+
+    dt_table = pa.Table.from_arrays([
+        pa.array([1, 2, 3, 4, 5], pa.int32()),
+        pa.array(["experiment/A/f.csv", "experiment/B/f.csv",
+                  "experiment/A/f.csv", "experiment/C/k.csv",
+                  "experiment/M/i.csv"], pa.utf8())], ["exp_id", "exp_meta"])
+
+    ds.write_dataset(
+        data=dt_table,
+        base_dir=path,
+        format='parquet',
+        partitioning=['exp_meta'],
+        partitioning_flavor='hive',
+    )
+
+    read_table = ds.dataset(
+        source=path,
+        format='parquet',
+        partitioning='hive',
+        schema=pa.schema([pa.field("exp_id", pa.int32()),
+                         pa.field("exp_meta", pa.utf8())])

Review Comment:
   Style
   ```suggestion
           schema=pa.schema([pa.field("exp_id", pa.int32()),
                             pa.field("exp_meta", pa.utf8())])
   ```



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -4912,3 +4914,40 @@ def test_read_table_nested_columns(tempdir, format):
         {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
          'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
     ]
+
+
+def test_dataset_partition_with_slash(tmpdir):
+    from pyarrow import dataset as ds
+
+    path = tmpdir / "slash-writer-x"
+
+    dt_table = pa.Table.from_arrays([
+        pa.array([1, 2, 3, 4, 5], pa.int32()),
+        pa.array(["experiment/A/f.csv", "experiment/B/f.csv",
+                  "experiment/A/f.csv", "experiment/C/k.csv",
+                  "experiment/M/i.csv"], pa.utf8())], ["exp_id", "exp_meta"])
+
+    ds.write_dataset(
+        data=dt_table,
+        base_dir=path,
+        format='parquet',

Review Comment:
   Can you use "ipc" so that this can run even if Parquet is disabled?
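   A minimal sketch of that swap, reusing the names from the test above;
   "ipc" is the Arrow IPC (Feather) format string accepted by both
   `write_dataset` and `dataset`, so the test no longer depends on Parquet:
   ```python
   ds.write_dataset(
       data=dt_table,
       base_dir=path,
       format='ipc',  # Arrow IPC files instead of Parquet
       partitioning=['exp_meta'],
       partitioning_flavor='hive',
   )

   read_table = ds.dataset(
       source=path,
       format='ipc',  # must match the format used for writing
       partitioning='hive',
       schema=pa.schema([pa.field("exp_id", pa.int32()),
                         pa.field("exp_meta", pa.utf8())])
   ).to_table().combine_chunks()
   ```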



##########
python/pyarrow/tests/test_dataset.py:
##########
@@ -4912,3 +4914,40 @@ def test_read_table_nested_columns(tempdir, format):
         {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4],
          'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}
     ]
+
+
+def test_dataset_partition_with_slash(tmpdir):
+    from pyarrow import dataset as ds
+
+    path = tmpdir / "slash-writer-x"
+
+    dt_table = pa.Table.from_arrays([
+        pa.array([1, 2, 3, 4, 5], pa.int32()),
+        pa.array(["experiment/A/f.csv", "experiment/B/f.csv",
+                  "experiment/A/f.csv", "experiment/C/k.csv",
+                  "experiment/M/i.csv"], pa.utf8())], ["exp_id", "exp_meta"])
+
+    ds.write_dataset(
+        data=dt_table,
+        base_dir=path,
+        format='parquet',
+        partitioning=['exp_meta'],
+        partitioning_flavor='hive',
+    )
+
+    read_table = ds.dataset(
+        source=path,
+        format='parquet',
+        partitioning='hive',
+        schema=pa.schema([pa.field("exp_id", pa.int32()),
+                         pa.field("exp_meta", pa.utf8())])
+    ).to_table().combine_chunks()
+
+    assert dt_table == read_table.sort_by("exp_id")
+
+    exp_meta = dt_table.column(1).to_pylist()
+    exp_meta = sorted(list(dict.fromkeys(exp_meta)))  # take unique

Review Comment:
   ```suggestion
       exp_meta = sorted(set(exp_meta))  # take unique
   ```
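   Since the result is sorted immediately afterwards, the insertion-order
   guarantee of `dict.fromkeys` buys nothing over a plain `set`; both
   spellings yield the same list. A quick check using the test's own values:
   ```python
   exp_meta = ["experiment/A/f.csv", "experiment/B/f.csv",
               "experiment/A/f.csv", "experiment/C/k.csv",
               "experiment/M/i.csv"]

   # Deduplicate, then sort: identical output either way.
   assert sorted(set(exp_meta)) == sorted(dict.fromkeys(exp_meta))
   ```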



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org