You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/28 09:24:00 UTC
[arrow] branch master updated: ARROW-3049: [C++/Python] Fix reading empty ORC file

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 498215f  ARROW-3049: [C++/Python] Fix reading empty ORC file
498215f is described below

commit 498215fb1b7f17e8912a35435aeaca9962e63430
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Tue Aug 28 11:23:54 2018 +0200

    ARROW-3049: [C++/Python] Fix reading empty ORC file
    
    Author: Kouhei Sutou <ko...@clear-code.com>
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2449 from pitrou/ARROW-3049-orc-empty-file and squashes the following commits:
    
    0ce0adc0 <Kouhei Sutou> Use the same schema in record batches
    338c17b1 <Kouhei Sutou> Fix style
    676175fb <Kouhei Sutou> Use schema that applies selection
    86613936 <Antoine Pitrou> Fix C++ docstrings
    c23fecdd <Antoine Pitrou> ARROW-3049: Fix reading empty ORC file
---
 cpp/src/arrow/adapters/orc/adapter.cc | 64 ++++++++++++++++++++++++++++-------
 cpp/src/arrow/adapters/orc/adapter.h  | 22 ++++++++++--
 python/pyarrow/tests/test_orc.py      | 37 ++++++++++++++++++--
 3 files changed, 107 insertions(+), 16 deletions(-)

diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 1ad2279..9fdeb9c 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -259,6 +259,17 @@ class ORCFileReader::Impl {
     return GetArrowSchema(type, out);
   }
 
+  Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
+    std::unique_ptr<liborc::RowReader> row_reader;
+    try {
+      row_reader = reader_->createRowReader(opts);
+    } catch (const liborc::ParseError& e) {
+      return Status::Invalid(e.what());
+    }
+    const liborc::Type& type = row_reader->getSelectedType();
+    return GetArrowSchema(type, out);
+  }
+
   Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
     if (type.getKind() != liborc::STRUCT) {
       return Status::NotImplemented(
@@ -288,19 +299,37 @@ class ORCFileReader::Impl {
 
   Status Read(std::shared_ptr<Table>* out) {
     liborc::RowReaderOptions opts;
-    return ReadTable(opts, out);
+    std::shared_ptr<Schema> schema;
+    RETURN_NOT_OK(ReadSchema(opts, &schema));
+    return ReadTable(opts, schema, out);
+  }
+
+  Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+    liborc::RowReaderOptions opts;
+    return ReadTable(opts, schema, out);
   }
 
   Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
     liborc::RowReaderOptions opts;
     RETURN_NOT_OK(SelectIndices(&opts, include_indices));
-    return ReadTable(opts, out);
+    std::shared_ptr<Schema> schema;
+    RETURN_NOT_OK(ReadSchema(opts, &schema));
+    return ReadTable(opts, schema, out);
+  }
+
+  Status Read(const std::shared_ptr<Schema>& schema,
+              const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+    liborc::RowReaderOptions opts;
+    RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+    return ReadTable(opts, schema, out);
   }
 
   Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
     liborc::RowReaderOptions opts;
     RETURN_NOT_OK(SelectStripe(&opts, stripe));
-    return ReadBatch(opts, stripes_[stripe].num_rows, out);
+    std::shared_ptr<Schema> schema;
+    RETURN_NOT_OK(ReadSchema(opts, &schema));
+    return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
   }
 
   Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
@@ -308,7 +337,9 @@ class ORCFileReader::Impl {
     liborc::RowReaderOptions opts;
     RETURN_NOT_OK(SelectIndices(&opts, include_indices));
     RETURN_NOT_OK(SelectStripe(&opts, stripe));
-    return ReadBatch(opts, stripes_[stripe].num_rows, out);
+    std::shared_ptr<Schema> schema;
+    RETURN_NOT_OK(ReadSchema(opts, &schema));
+    return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
   }
 
   Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
@@ -335,17 +366,18 @@ class ORCFileReader::Impl {
   }
 
   Status ReadTable(const liborc::RowReaderOptions& row_opts,
-                   std::shared_ptr<Table>* out) {
+                   const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
     liborc::RowReaderOptions opts(row_opts);
     std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
     for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
       opts.range(stripes_[stripe].offset, stripes_[stripe].length);
-      RETURN_NOT_OK(ReadBatch(opts, stripes_[stripe].num_rows, &batches[stripe]));
+      RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
     }
-    return Table::FromRecordBatches(batches, out);
+    return Table::FromRecordBatches(schema, batches, out);
   }
 
-  Status ReadBatch(const liborc::RowReaderOptions& opts, int64_t nrows,
+  Status ReadBatch(const liborc::RowReaderOptions& opts,
+                   const std::shared_ptr<Schema>& schema, int64_t nrows,
                    std::shared_ptr<RecordBatch>* out) {
     std::unique_ptr<liborc::RowReader> rowreader;
     std::unique_ptr<liborc::ColumnVectorBatch> batch;
@@ -355,16 +387,13 @@ class ORCFileReader::Impl {
     } catch (const liborc::ParseError& e) {
       return Status::Invalid(e.what());
     }
-    const liborc::Type& type = rowreader->getSelectedType();
-    std::shared_ptr<Schema> schema;
-    RETURN_NOT_OK(GetArrowSchema(type, &schema));
-
     std::unique_ptr<RecordBatchBuilder> builder;
     RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
 
     // The top-level type must be a struct to read into an arrow table
     const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
 
+    const liborc::Type& type = rowreader->getSelectedType();
     while (rowreader->next(*batch)) {
       for (int i = 0; i < builder->num_fields(); i++) {
         RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
@@ -674,11 +703,22 @@ Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
 
 Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
 
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+                           std::shared_ptr<Table>* out) {
+  return impl_->Read(schema, out);
+}
+
 Status ORCFileReader::Read(const std::vector<int>& include_indices,
                            std::shared_ptr<Table>* out) {
   return impl_->Read(include_indices, out);
 }
 
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+                           const std::vector<int>& include_indices,
+                           std::shared_ptr<Table>* out) {
+  return impl_->Read(schema, include_indices, out);
+}
+
 Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
   return impl_->ReadStripe(stripe, out);
 }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 6438658..f482dee 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -59,17 +59,35 @@ class ARROW_EXPORT ORCFileReader {
   ///
   /// The table will be composed of one record batch per stripe.
   ///
-  /// \param[out] out the returned RecordBatch
+  /// \param[out] out the returned Table
   Status Read(std::shared_ptr<Table>* out);
 
   /// \brief Read the file as a Table
   ///
   /// The table will be composed of one record batch per stripe.
   ///
+  /// \param[in] schema the Table schema
+  /// \param[out] out the returned Table
+  Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
   /// \param[in] include_indices the selected field indices to read
-  /// \param[out] out the returned RecordBatch
+  /// \param[out] out the returned Table
   Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
 
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \param[in] include_indices the selected field indices to read
+  /// \param[out] out the returned Table
+  Status Read(const std::shared_ptr<Schema>& schema,
+              const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
   /// \brief Read a single stripe as a RecordBatch
   ///
   /// \param[in] stripe the stripe index
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
index 311a5d4..3e51777 100644
--- a/python/pyarrow/tests/test_orc.py
+++ b/python/pyarrow/tests/test_orc.py
@@ -93,6 +93,7 @@ def check_example_file(orc_path, expected_df, need_fix=False):
     # Exercise ORCFile.read()
     table = orc_file.read()
     assert isinstance(table, pa.Table)
+    table._validate()
 
     # This workaround needed because of ARROW-3080
     orc_df = pd.DataFrame(table.to_pydict())
@@ -138,9 +139,41 @@ def check_example_using_json(example_name):
                        need_fix=True)
 
 
-@pytest.mark.xfail(strict=True, reason="ARROW-3049")
 def test_orcfile_empty():
-    check_example_using_json('TestOrcFile.emptyFile')
+    from pyarrow import orc
+    f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
+    table = f.read()
+    assert table.num_rows == 0
+    schema = table.schema
+    expected_schema = pa.schema([
+        ('boolean1', pa.bool_()),
+        ('byte1', pa.int8()),
+        ('short1', pa.int16()),
+        ('int1', pa.int32()),
+        ('long1', pa.int64()),
+        ('float1', pa.float32()),
+        ('double1', pa.float64()),
+        ('bytes1', pa.binary()),
+        ('string1', pa.string()),
+        ('middle', pa.struct([
+            ('list', pa.list_(pa.struct([
+                ('int1', pa.int32()),
+                ('string1', pa.string()),
+                ]))),
+            ])),
+        ('list', pa.list_(pa.struct([
+            ('int1', pa.int32()),
+            ('string1', pa.string()),
+            ]))),
+        ('map', pa.list_(pa.struct([
+            ('key', pa.string()),
+            ('value', pa.struct([
+                ('int1', pa.int32()),
+                ('string1', pa.string()),
+                ])),
+            ]))),
+        ])
+    assert schema == expected_schema
 
 
 def test_orcfile_test1_json():