You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain-text copy.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/04/18 08:30:49 UTC

[arrow] branch master updated: ARROW-5177: [C++/Python] Check column index when reading Parquet column

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 661d1cf  ARROW-5177: [C++/Python] Check column index when reading Parquet column
661d1cf is described below

commit 661d1cf8cf7854af515b5640c8de7ce125b45b89
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Apr 18 10:30:28 2019 +0200

    ARROW-5177: [C++/Python] Check column index when reading Parquet column
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #4163 from pitrou/ARROW-5177-parquet-read-column-invalid-index and squashes the following commits:
    
    a3fdf2108 <Antoine Pitrou> ARROW-5177:  Check column index when reading Parquet column
---
 cpp/src/parquet/arrow/reader.cc      |  7 +++++++
 python/pyarrow/tests/test_parquet.py | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index f891682..e2143c0 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -380,6 +380,13 @@ FileReader::~FileReader() {}
 
 Status FileReader::Impl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
                                    std::unique_ptr<ColumnReader>* out) {
+  if (i < 0 || i >= this->num_columns()) {
+    return Status::Invalid("Column index out of bounds (got ", i,
+                           ", should be "
+                           "between 0 and ",
+                           this->num_columns() - 1, ")");
+  }
+
   std::unique_ptr<FileColumnIterator> input(iterator_factory(i, reader_.get()));
   bool read_dict = reader_properties_.read_dictionary(i);
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 26d561f..3190b0a 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2552,3 +2552,16 @@ def test_partitioned_dataset(tempdir):
                         partition_cols=['one', 'two'])
     table = pq.ParquetDataset(path).read()
     pq.write_table(table, path / "output.parquet")
+
+
+def test_read_column_invalid_index():
+    table = pa.Table.from_arrays([pa.array([4, 5]), pa.array(["foo", "bar"])],
+                                 ['ints', 'strs'])
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    f = pq.ParquetFile(bio.getvalue())
+    assert f.reader.read_column(0).to_pylist() == [4, 5]
+    assert f.reader.read_column(1).to_pylist() == ["foo", "bar"]
+    for index in (-1, 2):
+        with pytest.raises((ValueError, IndexError)):
+            f.reader.read_column(index)