You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/05/24 18:07:15 UTC
[arrow] branch master updated: PARQUET-1243: [C++] Throw more
informative exception when reading a length-0 Parquet file
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d20640d PARQUET-1243: [C++] Throw more informative exception when reading a length-0 Parquet file
d20640d is described below
commit d20640deb0cf585971c07221e17dc657e0cda66d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Fri May 24 13:07:00 2019 -0500
PARQUET-1243: [C++] Throw more informative exception when reading a length-0 Parquet file
Author: Wes McKinney <we...@apache.org>
Closes #4240 from wesm/PARQUET-1243 and squashes the following commits:
dd61458b5 <Wes McKinney> clang-format
75ee6ef01 <Wes McKinney> Print expected size of standard file footer, use consistent style for constants in parquet/file_reader.cc
3b3f47523 <Wes McKinney> Raise a better exception when reading a length-0 Parquet file
---
cpp/src/parquet/file_reader.cc | 31 ++++++++++++++++++-------------
python/pyarrow/tests/test_parquet.py | 15 +++++++++++++++
2 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 152f4ca..0fe911a 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -41,9 +41,9 @@
namespace parquet {
// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
-static constexpr int64_t DEFAULT_FOOTER_READ_SIZE = 64 * 1024;
-static constexpr uint32_t FOOTER_SIZE = 8;
-static constexpr uint8_t PARQUET_MAGIC[4] = {'P', 'A', 'R', '1'};
+static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
+static constexpr uint32_t kFooterSize = 8;
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
// For PARQUET-816
static constexpr int64_t kMaxDictHeaderSize = 100;
@@ -162,25 +162,30 @@ class SerializedFile : public ParquetFileReader::Contents {
void ParseMetaData() {
int64_t file_size = source_->Size();
- if (file_size < FOOTER_SIZE) {
- throw ParquetException("Corrupted file, smaller than file footer");
+ if (file_size == 0) {
+ throw ParquetException("Invalid Parquet file size is 0 bytes");
+ } else if (file_size < kFooterSize) {
+ std::stringstream ss;
+ ss << "Invalid Parquet file size is " << file_size
+ << " bytes, smaller than standard file footer (" << kFooterSize << " bytes)";
+ throw ParquetException(ss.str());
}
- uint8_t footer_buffer[DEFAULT_FOOTER_READ_SIZE];
- int64_t footer_read_size = std::min(file_size, DEFAULT_FOOTER_READ_SIZE);
+ uint8_t footer_buffer[kDefaultFooterReadSize];
+ int64_t footer_read_size = std::min(file_size, kDefaultFooterReadSize);
int64_t bytes_read =
source_->ReadAt(file_size - footer_read_size, footer_read_size, footer_buffer);
// Check if all bytes are read. Check if last 4 bytes read have the magic bits
if (bytes_read != footer_read_size ||
- memcmp(footer_buffer + footer_read_size - 4, PARQUET_MAGIC, 4) != 0) {
+ memcmp(footer_buffer + footer_read_size - 4, kParquetMagic, 4) != 0) {
throw ParquetException("Invalid parquet file. Corrupt footer.");
}
uint32_t metadata_len =
- *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - FOOTER_SIZE);
- int64_t metadata_start = file_size - FOOTER_SIZE - metadata_len;
- if (FOOTER_SIZE + metadata_len > file_size) {
+ *reinterpret_cast<uint32_t*>(footer_buffer + footer_read_size - kFooterSize);
+ int64_t metadata_start = file_size - kFooterSize - metadata_len;
+ if (kFooterSize + metadata_len > file_size) {
throw ParquetException(
"Invalid parquet file. File is less than "
"file metadata size.");
@@ -190,9 +195,9 @@ class SerializedFile : public ParquetFileReader::Contents {
AllocateBuffer(properties_.memory_pool(), metadata_len);
// Check if the footer_buffer contains the entire metadata
- if (footer_read_size >= (metadata_len + FOOTER_SIZE)) {
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
memcpy(metadata_buffer->mutable_data(),
- footer_buffer + (footer_read_size - metadata_len - FOOTER_SIZE),
+ footer_buffer + (footer_read_size - metadata_len - kFooterSize),
metadata_len);
} else {
bytes_read =
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 8e31a1a..a92049a 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2651,3 +2651,18 @@ def test_dataset_metadata(tempdir):
assert d.pop('serialized_size') == 0
assert d2.pop('serialized_size') > 0
assert d == d2
+
+
+def test_parquet_file_too_small(tempdir):
+ path = str(tempdir / "test.parquet")
+ with pytest.raises(pa.ArrowIOError,
+ match='size is 0 bytes'):
+ with open(path, 'wb') as f:
+ pass
+ pq.read_table(path)
+
+ with pytest.raises(pa.ArrowIOError,
+ match='size is 4 bytes'):
+ with open(path, 'wb') as f:
+ f.write(b'ffff')
+ pq.read_table(path)