You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/05/21 16:53:59 UTC

[arrow] branch master updated: PARQUET-1402: [C++] Parquet files with dictionary page offset as 0 is not readable

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a9ba61  PARQUET-1402: [C++] Parquet files with dictionary page offset as 0 is not readable
7a9ba61 is described below

commit 7a9ba6178d243779cf964b27095b4c5024223cf0
Author: shyam <sh...@dremio.com>
AuthorDate: Tue May 21 11:53:51 2019 -0500

    PARQUET-1402: [C++] Parquet files with dictionary page offset as 0 is not readable
    
    pyarrow needs to handle a dictionary page offset of 0 as a special case in order to be compatible with the Java Parquet reader.
    
    Author: shyam <sh...@dremio.com>
    
    Closes #4359 from shyambits2004/5322 and squashes the following commits:
    
    f47762ab0 <shyam> Parquet files with dictionary page offset as 0 is not readable
---
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 6 ++++++
 cpp/src/parquet/file_reader.cc                    | 3 ++-
 cpp/submodules/parquet-testing                    | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 84ace3f..bec35d5 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2354,6 +2354,12 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
+TEST(TestArrowReaderAdHoc, HandleDictPageOffsetZero) {
+  // PARQUET-1402: parquet-mr writes files this way which tripped up
+  // some business logic
+  TryReadDataFile(test::get_data_file("dict-page-offset-zero.parquet"));
+}
+
 class TestArrowReaderAdHocSparkAndHvr
     : public ::testing::TestWithParam<
           std::tuple<std::string, std::shared_ptr<::DataType>>> {};
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 38d296d..152f4ca 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -94,7 +94,8 @@ class SerializedRowGroup : public RowGroupReader::Contents {
     auto col = row_group_metadata_->ColumnChunk(i);
 
     int64_t col_start = col->data_page_offset();
-    if (col->has_dictionary_page() && col_start > col->dictionary_page_offset()) {
+    if (col->has_dictionary_page() && col->dictionary_page_offset() > 0 &&
+        col_start > col->dictionary_page_offset()) {
       col_start = col->dictionary_page_offset();
     }
 
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index bb7b6ab..2fc3ade 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit bb7b6abbb3fbeff845646364a4286142127be04c
+Subproject commit 2fc3ade4ccbf17271194df0b1549bc6733204314