You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/08/31 05:32:24 UTC

[5/7] incubator-impala git commit: IMPALA-5840: Don't write page-level statistics in Parquet files.

IMPALA-5840: Don't write page-level statistics in Parquet files.

Page-level statistics in Parquet files are expected to be deprecated in
favor of page indexes (PARQUET-922). This change disables writing
statistics to pages. Impala is currently the only project writing them.
Neither Impala nor other projects make use of these right now, and by no
longer writing them we prevent others from depending on soon-to-be
deprecated fields.

Change-Id: I1b05131320370171d76e93a46b04880a7f9b6d84
Reviewed-on: http://gerrit.cloudera.org:8080/7817
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/1faf89f0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/1faf89f0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/1faf89f0

Branch: refs/heads/release-2.10.0
Commit: 1faf89f047e7d78c3a1f3b518269a3ae21a4ddea
Parents: 73cb9b8
Author: Lars Volker <lv...@cloudera.com>
Authored: Thu Aug 24 14:59:22 2017 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Wed Aug 30 14:54:49 2017 -0700

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-table-writer.cc | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1faf89f0/be/src/exec/hdfs-parquet-table-writer.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-table-writer.cc b/be/src/exec/hdfs-parquet-table-writer.cc
index 237dd83..4bbadb4 100644
--- a/be/src/exec/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/hdfs-parquet-table-writer.cc
@@ -385,7 +385,8 @@ class HdfsParquetTableWriter::ColumnWriter :
   // Temporary string value to hold CHAR(N)
   StringValue temp_;
 
-  // Tracks statistics per page.
+  // Tracks statistics per page. These are not written out currently but are merged into
+  // the row group stats. TODO(IMPALA-5841): Write these to the page index.
   scoped_ptr<ColumnStats<T>> page_stats_;
 
   // Tracks statistics per row group. This gets reset when starting a new row group.
@@ -453,7 +454,8 @@ class HdfsParquetTableWriter::BoolColumnWriter :
   // Used to encode bools as single bit values. This is reused across pages.
   BitWriter* bool_values_;
 
-  // Tracks statistics per page.
+  // Tracks statistics per page. These are not written out currently but are merged into
+  // the row group stats. TODO(IMPALA-5841): Write these to the page index.
   ColumnStats<bool> page_stats_;
 
   // Tracks statistics per row group. This gets reset when starting a new file.
@@ -695,15 +697,9 @@ Status HdfsParquetTableWriter::BaseColumnWriter::FinalizeCurrentPage() {
         max_compressed_size - header.compressed_page_size);
   }
 
-  // Build page statistics and add them to the header.
-  DCHECK(page_stats_base_ != nullptr);
-  if (page_stats_base_->BytesNeeded() <= MAX_COLUMN_STATS_SIZE) {
-    page_stats_base_->EncodeToThrift(&header.data_page_header.statistics);
-    header.data_page_header.__isset.statistics = true;
-  }
-
   // Update row group statistics from page statistics.
   DCHECK(row_group_stats_base_ != nullptr);
+  DCHECK(page_stats_base_ != nullptr);
   row_group_stats_base_->Merge(*page_stats_base_);
 
   // Add the size of the data page header