You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/10/18 16:43:28 UTC
[05/32] incubator-impala git commit: IMPALA-3943: Address post-merge
comments.
IMPALA-3943: Address post-merge comments.
Adds code comments and issues a warning for Parquet files
with num_rows=0 but at least one non-empty row group.
Change-Id: I72ccf00191afddb8583ac961f1eaf11e5eb28791
Reviewed-on: http://gerrit.cloudera.org:8080/4696
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Internal Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2a04b0e2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2a04b0e2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2a04b0e2
Branch: refs/heads/hadoop-next
Commit: 2a04b0e21a763e3d1f4269e7a8c3027ff0cab934
Parents: 47b8aa3
Author: Alex Behm <al...@cloudera.com>
Authored: Wed Oct 12 13:31:54 2016 -0700
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Fri Oct 14 05:41:22 2016 +0000
----------------------------------------------------------------------
be/src/exec/hdfs-parquet-scanner.cc | 24 ++++++++++++++++++--
common/thrift/generate_error_codes.py | 3 +++
.../queries/QueryTest/parquet-zero-rows.test | 4 ++++
3 files changed, 29 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 7782e8a..86dbbf8 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -410,7 +410,10 @@ Status HdfsParquetScanner::NextRowGroup() {
++row_group_idx_;
if (row_group_idx_ >= file_metadata_.row_groups.size()) break;
const parquet::RowGroup& row_group = file_metadata_.row_groups[row_group_idx_];
- if (row_group.num_rows == 0 || file_metadata_.num_rows == 0) continue;
+ // Also check 'file_metadata_.num_rows' to make sure 'select count(*)' and 'select *'
+ // behave consistently for corrupt files that have 'file_metadata_.num_rows == 0'
+ // but some data in row groups.
+ if (row_group.num_rows == 0|| file_metadata_.num_rows == 0) continue;
const DiskIoMgr::ScanRange* split_range = static_cast<ScanRangeMetadata*>(
metadata_range_->meta_data())->original_split;
@@ -897,7 +900,24 @@ Status HdfsParquetScanner::ProcessFooter() {
RETURN_IF_ERROR(ParquetMetadataUtils::ValidateFileVersion(file_metadata_, filename()));
// IMPALA-3943: Do not throw an error for empty files for backwards compatibility.
- if (file_metadata_.num_rows == 0) return Status::OK();
+ if (file_metadata_.num_rows == 0) {
+ // Warn if the num_rows is inconsistent with the row group metadata.
+ if (!file_metadata_.row_groups.empty()) {
+ bool has_non_empty_row_group = false;
+ for (const parquet::RowGroup& row_group : file_metadata_.row_groups) {
+ if (row_group.num_rows > 0) {
+ has_non_empty_row_group = true;
+ break;
+ }
+ }
+ // Warn if there is at least one non-empty row group.
+ if (has_non_empty_row_group) {
+ ErrorMsg msg(TErrorCode::PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE, filename());
+ state_->LogError(msg);
+ }
+ }
+ return Status::OK();
+ }
// Parse out the created by application version string
if (file_metadata_.__isset.created_by) {
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 216d1c1..131947a 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -286,6 +286,9 @@ error_codes = (
"while spilling data to disk."),
("BUFFER_ALLOCATION_FAILED", 92, "Unexpected error allocating $0 byte buffer."),
+
+ ("PARQUET_ZERO_ROWS_IN_NON_EMPTY_FILE", 93, "File '$0' is corrupt: metadata indicates "
+ "a zero row count but there is at least one non-empty row group."),
)
import sys
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2a04b0e2/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
index e7de245..0dbf3e7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-zero-rows.test
@@ -17,6 +17,8 @@ select * from zero_rows_one_row_group
---- TYPES
int
---- RESULTS
+---- ERRORS
+File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but there is at least one non-empty row group.
====
---- QUERY
select count(*) from zero_rows_one_row_group
@@ -24,4 +26,6 @@ select count(*) from zero_rows_one_row_group
bigint
---- RESULTS
0
+---- ERRORS
+File '__HDFS_FILENAME__' is corrupt: metadata indicates a zero row count but there is at least one non-empty row group.
====