You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/08/25 21:41:51 UTC

[impala] 01/04: IMPALA-7779 Parquet Scanner can write binary data into profile

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 2ebf554dfdb0dc9055ef95c8f2ec4fad51f1e657
Author: Qifan Chen <qc...@cloudera.com>
AuthorDate: Wed Aug 12 16:33:51 2020 -0400

    IMPALA-7779 Parquet Scanner can write binary data into profile
    
    This fix addresses a limitation where an ill-formatted Parquet
    version string is emitted as-is, without being converted to a
    readable form, in an error message or in impalad.INFO. With the
    fix, any such string is converted to a hex string first. The hex
    string is a sequence of four groups of hex digits separated by
    spaces, where each group is one or two hex digits, such as
    "6c 65 2e a".
    
    Testing:
     Ran "core" tests successfully.
    
    Change-Id: I281d6fa7cb2f88f04588110943e3e768678b9cf1
    Reviewed-on: http://gerrit.cloudera.org:8080/16331
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Sahil Takiar <st...@cloudera.com>
---
 be/src/exec/parquet/hdfs-parquet-scanner.cc                        | 3 ++-
 common/thrift/generate_error_codes.py                              | 2 +-
 testdata/workloads/functional-query/queries/QueryTest/parquet.test | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index d065396..dd81e8c 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -1331,8 +1331,9 @@ Status HdfsParquetScanner::ProcessFooter() {
   uint8_t* magic_number_ptr = buffer + scan_range_len - sizeof(PARQUET_VERSION_NUMBER);
   if (memcmp(magic_number_ptr, PARQUET_VERSION_NUMBER,
              sizeof(PARQUET_VERSION_NUMBER)) != 0) {
+    // Report the ill-formatted Parquet version string in hex.
     return Status(TErrorCode::PARQUET_BAD_VERSION_NUMBER, filename(),
-        string(reinterpret_cast<char*>(magic_number_ptr), sizeof(PARQUET_VERSION_NUMBER)),
+        ReadWriteUtil::HexDump(magic_number_ptr, sizeof(PARQUET_VERSION_NUMBER)),
         scan_node_->hdfs_table()->fully_qualified_name());
   }
 
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 39983a3..390b2ce 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -193,7 +193,7 @@ error_codes = (
    "Try running \\\"refresh $1\\\" to reload the file metadata."),
 
   ("PARQUET_BAD_VERSION_NUMBER", 60, "File '$0' has an invalid Parquet version number: "
-   "$1\\n. Please check that it is a valid Parquet file. "
+   "$1.\\nPlease check that it is a valid Parquet file. "
    "This error can also occur due to stale metadata. "
    "If you believe this is a valid Parquet file, try running \\\"refresh $2\\\"."),
 
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet.test b/testdata/workloads/functional-query/queries/QueryTest/parquet.test
index b0b188f..2aee0d0 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet.test
@@ -50,7 +50,7 @@ bigint,bigint,string,string,boolean,boolean,bigint,bigint,bigint,bigint
 # Parquet file with invalid magic number
 SELECT * from bad_magic_number
 ---- CATCH
-File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: XXXX
+File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: 58 58 58 58
 ====
 ---- QUERY
 # count(*) query on parquet file with multiple blocks (one block per node)