You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mj...@apache.org on 2017/08/29 16:00:55 UTC

[7/7] incubator-impala git commit: IMPALA-5840: Don't write page-level statistics in Parquet files.

IMPALA-5840: Don't write page-level statistics in Parquet files.

Page-level statistics in Parquet files are expected to be deprecated in
favor of page indexes (PARQUET-922). This change disables writing
statistics to pages. Impala is currently the only project writing them.
Neither Impala nor other projects make use of them right now, and by no
longer writing them we prevent others from depending on soon-to-be
deprecated fields.

Change-Id: I1b05131320370171d76e93a46b04880a7f9b6d84
Reviewed-on: http://gerrit.cloudera.org:8080/7817
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8149d0e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8149d0e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8149d0e5

Branch: refs/heads/master
Commit: 8149d0e5778525cb4988953377685946f31d70a2
Parents: f6c38ac
Author: Lars Volker <lv...@cloudera.com>
Authored: Thu Aug 24 14:59:22 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Tue Aug 29 05:22:51 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-table-writer.cc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8149d0e5/be/src/exec/hdfs-parquet-table-writer.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-table-writer.cc b/be/src/exec/hdfs-parquet-table-writer.cc
index 237dd83..4bbadb4 100644
--- a/be/src/exec/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/hdfs-parquet-table-writer.cc
@@ -385,7 +385,8 @@ class HdfsParquetTableWriter::ColumnWriter :
   // Temporary string value to hold CHAR(N)
   StringValue temp_;
 
-  // Tracks statistics per page.
+  // Tracks statistics per page. These are not written out currently but are merged into
+  // the row group stats. TODO(IMPALA-5841): Write these to the page index.
   scoped_ptr<ColumnStats<T>> page_stats_;
 
   // Tracks statistics per row group. This gets reset when starting a new row group.
@@ -453,7 +454,8 @@ class HdfsParquetTableWriter::BoolColumnWriter :
   // Used to encode bools as single bit values. This is reused across pages.
   BitWriter* bool_values_;
 
-  // Tracks statistics per page.
+  // Tracks statistics per page. These are not written out currently but are merged into
+  // the row group stats. TODO(IMPALA-5841): Write these to the page index.
   ColumnStats<bool> page_stats_;
 
   // Tracks statistics per row group. This gets reset when starting a new file.
@@ -695,15 +697,9 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() {
         max_compressed_size - header.compressed_page_size);
   }
 
-  // Build page statistics and add them to the header.
-  DCHECK(page_stats_base_ != nullptr);
-  if (page_stats_base_->BytesNeeded() <= MAX_COLUMN_STATS_SIZE) {
-    page_stats_base_->EncodeToThrift(&header.data_page_header.statistics);
-    header.data_page_header.__isset.statistics = true;
-  }
-
   // Update row group statistics from page statistics.
   DCHECK(row_group_stats_base_ != nullptr);
+  DCHECK(page_stats_base_ != nullptr);
   row_group_stats_base_->Merge(*page_stats_base_);
 
   // Add the size of the data page header