You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/08/25 21:41:51 UTC
[impala] 01/04: IMPALA-7779 Parquet Scanner can write binary data
into profile
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 2ebf554dfdb0dc9055ef95c8f2ec4fad51f1e657
Author: Qifan Chen <qc...@cloudera.com>
AuthorDate: Wed Aug 12 16:33:51 2020 -0400
IMPALA-7779 Parquet Scanner can write binary data into profile
This fix addresses a limitation in which a malformed Parquet version
string was not sanitized before appearing in an error message or in
impalad.INFO, so raw binary data could end up in the profile. With the
fix, any such string is converted to a hex string first. The hex string
is a sequence of four groups separated by spaces, where each group is
one or two hex digits, such as "6c 65 2e a".
Testing:
Ran "core" tests successfully.
Change-Id: I281d6fa7cb2f88f04588110943e3e768678b9cf1
Reviewed-on: http://gerrit.cloudera.org:8080/16331
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Sahil Takiar <st...@cloudera.com>
---
be/src/exec/parquet/hdfs-parquet-scanner.cc | 3 ++-
common/thrift/generate_error_codes.py | 2 +-
testdata/workloads/functional-query/queries/QueryTest/parquet.test | 2 +-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index d065396..dd81e8c 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -1331,8 +1331,9 @@ Status HdfsParquetScanner::ProcessFooter() {
uint8_t* magic_number_ptr = buffer + scan_range_len - sizeof(PARQUET_VERSION_NUMBER);
if (memcmp(magic_number_ptr, PARQUET_VERSION_NUMBER,
sizeof(PARQUET_VERSION_NUMBER)) != 0) {
+ // Report the ill-formatted Parquet version string in hex.
return Status(TErrorCode::PARQUET_BAD_VERSION_NUMBER, filename(),
- string(reinterpret_cast<char*>(magic_number_ptr), sizeof(PARQUET_VERSION_NUMBER)),
+ ReadWriteUtil::HexDump(magic_number_ptr, sizeof(PARQUET_VERSION_NUMBER)),
scan_node_->hdfs_table()->fully_qualified_name());
}
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 39983a3..390b2ce 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -193,7 +193,7 @@ error_codes = (
"Try running \\\"refresh $1\\\" to reload the file metadata."),
("PARQUET_BAD_VERSION_NUMBER", 60, "File '$0' has an invalid Parquet version number: "
- "$1\\n. Please check that it is a valid Parquet file. "
+ "$1.\\nPlease check that it is a valid Parquet file. "
"This error can also occur due to stale metadata. "
"If you believe this is a valid Parquet file, try running \\\"refresh $2\\\"."),
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet.test b/testdata/workloads/functional-query/queries/QueryTest/parquet.test
index b0b188f..2aee0d0 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet.test
@@ -50,7 +50,7 @@ bigint,bigint,string,string,boolean,boolean,bigint,bigint,bigint,bigint
# Parquet file with invalid magic number
SELECT * from bad_magic_number
---- CATCH
-File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: XXXX
+File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: 58 58 58 58
====
---- QUERY
# count(*) query on parquet file with multiple blocks (one block per node)