Posted to commits@arrow.apache.org by ap...@apache.org on 2022/05/04 13:49:58 UTC

[arrow] branch master updated: ARROW-16436: [C++][Python] Datasets should not ignore CSV autogenerate_column_names

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 37c3bd00f8 ARROW-16436: [C++][Python] Datasets should not ignore CSV autogenerate_column_names
37c3bd00f8 is described below

commit 37c3bd00f812513fe22179ae87573893c741af51
Author: Raúl Cumplido <ra...@gmail.com>
AuthorDate: Wed May 4 15:49:52 2022 +0200

    ARROW-16436: [C++][Python] Datasets should not ignore CSV autogenerate_column_names
    
    The added test failed previously because `autogenerate_column_names` was ignored:
    ```
    E   pyarrow.lib.ArrowInvalid: Error creating dataset. Could not read schema from '/tmp/pytest-of/pytest-15/test_csv_format_options_genera1/test.csv': Could not open CSV input source '/tmp/pytest-of/pytest-15/test_csv_format_options_genera1/test.csv': Invalid: CSV file contained multiple columns named 1. Is this a 'csv' file?
    ```
    Use the same approach as `GenerateColumnNames` in the CSV reader: https://github.com/apache/arrow/blob/master/cpp/src/arrow/csv/reader.cc#L637-L646
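    
    For illustration, a minimal usage sketch of the fixed behavior from Python, mirroring the test added below (the file name `headerless.csv` is hypothetical):
    ```
    import pyarrow as pa
    import pyarrow.csv
    import pyarrow.dataset as ds
    
    # A headerless CSV such as "1,a,true,1\n"; with autogenerate_column_names=True
    # the dataset should name the columns f0, f1, ... instead of consuming the
    # first row as a header.
    dataset = ds.dataset(
        "headerless.csv",
        format=ds.CsvFileFormat(
            read_options=pa.csv.ReadOptions(autogenerate_column_names=True)))
    print(dataset.to_table().column_names)  # ['f0', 'f1', 'f2', 'f3']
    ```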
    
    Closes #13064 from raulcd/ARROW-16436
    
    Authored-by: Raúl Cumplido <ra...@gmail.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/dataset/file_csv.cc    | 10 ++++++++++
 python/pyarrow/tests/test_dataset.py | 16 ++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index 277bab29a0..d185edf49d 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -85,6 +85,16 @@ Result<std::unordered_set<std::string>> GetColumnNames(
 
   std::unordered_set<std::string> column_names;
 
+  if (read_options.autogenerate_column_names) {
+    column_names.reserve(parser.num_cols());
+    for (int32_t i = 0; i < parser.num_cols(); ++i) {
+      std::stringstream ss;
+      ss << "f" << i;
+      column_names.emplace(ss.str());
+    }
+    return column_names;
+  }
+
   RETURN_NOT_OK(
       parser.VisitLastRow([&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
         // Skip BOM when reading column names (ARROW-14644)
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 6eda764f27..6aed7734f6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3033,6 +3033,22 @@ def test_csv_format_options(tempdir, dataset_reader):
         pa.table({'foo': pa.array(['skipped', 'col0', 'foo', 'bar'])}))
 
 
+def test_csv_format_options_generate_columns(tempdir, dataset_reader):
+    path = str(tempdir / 'test.csv')
+    with open(path, 'w') as sink:
+        sink.write('1,a,true,1\n')
+
+    dataset = ds.dataset(path, format=ds.CsvFileFormat(
+        read_options=pa.csv.ReadOptions(autogenerate_column_names=True)))
+    result = dataset_reader.to_table(dataset)
+    expected_column_names = ["f0", "f1", "f2", "f3"]
+    assert result.column_names == expected_column_names
+    assert result.equals(pa.table({'f0': pa.array([1]),
+                                   'f1': pa.array(["a"]),
+                                   'f2': pa.array([True]),
+                                   'f3': pa.array([1])}))
+
+
 def test_csv_fragment_options(tempdir, dataset_reader):
     path = str(tempdir / 'test.csv')
     with open(path, 'w') as sink: