You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/28 09:24:00 UTC
[arrow] branch master updated: ARROW-3049: [C++/Python] Fix reading
empty ORC file
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 498215f ARROW-3049: [C++/Python] Fix reading empty ORC file
498215f is described below
commit 498215fb1b7f17e8912a35435aeaca9962e63430
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Tue Aug 28 11:23:54 2018 +0200
ARROW-3049: [C++/Python] Fix reading empty ORC file
Author: Kouhei Sutou <ko...@clear-code.com>
Author: Antoine Pitrou <an...@python.org>
Closes #2449 from pitrou/ARROW-3049-orc-empty-file and squashes the following commits:
0ce0adc0 <Kouhei Sutou> Use the same schema in record batches
338c17b1 <Kouhei Sutou> Fix style
676175fb <Kouhei Sutou> Use schema that applies selection
86613936 <Antoine Pitrou> Fix C++ docstrings
c23fecdd <Antoine Pitrou> ARROW-3049: Fix reading empty ORC file
---
cpp/src/arrow/adapters/orc/adapter.cc | 64 ++++++++++++++++++++++++++++-------
cpp/src/arrow/adapters/orc/adapter.h | 22 ++++++++++--
python/pyarrow/tests/test_orc.py | 37 ++++++++++++++++++--
3 files changed, 107 insertions(+), 16 deletions(-)
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 1ad2279..9fdeb9c 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -259,6 +259,17 @@ class ORCFileReader::Impl {
return GetArrowSchema(type, out);
}
+ Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ try {
+ row_reader = reader_->createRowReader(opts);
+ } catch (const liborc::ParseError& e) {
+ return Status::Invalid(e.what());
+ }
+ const liborc::Type& type = row_reader->getSelectedType();
+ return GetArrowSchema(type, out);
+ }
+
Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
if (type.getKind() != liborc::STRUCT) {
return Status::NotImplemented(
@@ -288,19 +299,37 @@ class ORCFileReader::Impl {
Status Read(std::shared_ptr<Table>* out) {
liborc::RowReaderOptions opts;
- return ReadTable(opts, out);
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ return ReadTable(opts, schema, out);
}
Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
liborc::RowReaderOptions opts;
RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- return ReadTable(opts, out);
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ return ReadTable(opts, schema, out);
}
Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
liborc::RowReaderOptions opts;
RETURN_NOT_OK(SelectStripe(&opts, stripe));
- return ReadBatch(opts, stripes_[stripe].num_rows, out);
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
}
Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
@@ -308,7 +337,9 @@ class ORCFileReader::Impl {
liborc::RowReaderOptions opts;
RETURN_NOT_OK(SelectIndices(&opts, include_indices));
RETURN_NOT_OK(SelectStripe(&opts, stripe));
- return ReadBatch(opts, stripes_[stripe].num_rows, out);
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
}
Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
@@ -335,17 +366,18 @@ class ORCFileReader::Impl {
}
Status ReadTable(const liborc::RowReaderOptions& row_opts,
- std::shared_ptr<Table>* out) {
+ const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
liborc::RowReaderOptions opts(row_opts);
std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
opts.range(stripes_[stripe].offset, stripes_[stripe].length);
- RETURN_NOT_OK(ReadBatch(opts, stripes_[stripe].num_rows, &batches[stripe]));
+ RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
}
- return Table::FromRecordBatches(batches, out);
+ return Table::FromRecordBatches(schema, batches, out);
}
- Status ReadBatch(const liborc::RowReaderOptions& opts, int64_t nrows,
+ Status ReadBatch(const liborc::RowReaderOptions& opts,
+ const std::shared_ptr<Schema>& schema, int64_t nrows,
std::shared_ptr<RecordBatch>* out) {
std::unique_ptr<liborc::RowReader> rowreader;
std::unique_ptr<liborc::ColumnVectorBatch> batch;
@@ -355,16 +387,13 @@ class ORCFileReader::Impl {
} catch (const liborc::ParseError& e) {
return Status::Invalid(e.what());
}
- const liborc::Type& type = rowreader->getSelectedType();
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(GetArrowSchema(type, &schema));
-
std::unique_ptr<RecordBatchBuilder> builder;
RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
// The top-level type must be a struct to read into an arrow table
const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+ const liborc::Type& type = rowreader->getSelectedType();
while (rowreader->next(*batch)) {
for (int i = 0; i < builder->num_fields(); i++) {
RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
@@ -674,11 +703,22 @@ Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, out);
+}
+
Status ORCFileReader::Read(const std::vector<int>& include_indices,
std::shared_ptr<Table>* out) {
return impl_->Read(include_indices, out);
}
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, include_indices, out);
+}
+
Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
return impl_->ReadStripe(stripe, out);
}
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 6438658..f482dee 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -59,17 +59,35 @@ class ARROW_EXPORT ORCFileReader {
///
/// The table will be composed of one record batch per stripe.
///
- /// \param[out] out the returned RecordBatch
+ /// \param[out] out the returned Table
Status Read(std::shared_ptr<Table>* out);
/// \brief Read the file as a Table
///
/// The table will be composed of one record batch per stripe.
///
+ /// \param[in] schema the Table schema
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
/// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned RecordBatch
+ /// \param[out] out the returned Table
Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
/// \brief Read a single stripe as a RecordBatch
///
/// \param[in] stripe the stripe index
diff --git a/python/pyarrow/tests/test_orc.py b/python/pyarrow/tests/test_orc.py
index 311a5d4..3e51777 100644
--- a/python/pyarrow/tests/test_orc.py
+++ b/python/pyarrow/tests/test_orc.py
@@ -93,6 +93,7 @@ def check_example_file(orc_path, expected_df, need_fix=False):
# Exercise ORCFile.read()
table = orc_file.read()
assert isinstance(table, pa.Table)
+ table._validate()
# This workaround needed because of ARROW-3080
orc_df = pd.DataFrame(table.to_pydict())
@@ -138,9 +139,41 @@ def check_example_using_json(example_name):
need_fix=True)
-@pytest.mark.xfail(strict=True, reason="ARROW-3049")
def test_orcfile_empty():
- check_example_using_json('TestOrcFile.emptyFile')
+ from pyarrow import orc
+ f = orc.ORCFile(path_for_orc_example('TestOrcFile.emptyFile'))
+ table = f.read()
+ assert table.num_rows == 0
+ schema = table.schema
+ expected_schema = pa.schema([
+ ('boolean1', pa.bool_()),
+ ('byte1', pa.int8()),
+ ('short1', pa.int16()),
+ ('int1', pa.int32()),
+ ('long1', pa.int64()),
+ ('float1', pa.float32()),
+ ('double1', pa.float64()),
+ ('bytes1', pa.binary()),
+ ('string1', pa.string()),
+ ('middle', pa.struct([
+ ('list', pa.list_(pa.struct([
+ ('int1', pa.int32()),
+ ('string1', pa.string()),
+ ]))),
+ ])),
+ ('list', pa.list_(pa.struct([
+ ('int1', pa.int32()),
+ ('string1', pa.string()),
+ ]))),
+ ('map', pa.list_(pa.struct([
+ ('key', pa.string()),
+ ('value', pa.struct([
+ ('int1', pa.int32()),
+ ('string1', pa.string()),
+ ])),
+ ]))),
+ ])
+ assert schema == expected_schema
def test_orcfile_test1_json():