You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ph...@apache.org on 2018/01/17 05:24:20 UTC
impala git commit: IMPALA-6353: Fix crash in snappy decompressor

Repository: impala
Updated Branches:
  refs/heads/master f8b406222 -> 6cc76d720


IMPALA-6353: Fix crash in snappy decompressor

SnappyDecompressor::MaxOutputLen assumes the input pointer to be
non-null. It's not true when the parquet file is corrupted and the
compressed_page_size field in a page header is 0. This patch handles
this error instead of failing a DCHECK.

Testing: A bad parquet file with 0 compressed_page_size is added. It
crashes impala without this patch.

Change-Id: I0d42937aab92a74f8e104d2f7fcd64dc24f6a500
Reviewed-on: http://gerrit.cloudera.org:8080/8977
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6cc76d72
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6cc76d72
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6cc76d72

Branch: refs/heads/master
Commit: 6cc76d72016b8d5672676e8e8a979b0807803ad9
Parents: f8b4062
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Tue Jan 9 13:03:45 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Jan 17 04:18:24 2018 +0000

----------------------------------------------------------------------
 be/src/util/decompress.cc                            |   1 +
 testdata/data/README                                 |   5 +++++
 testdata/data/bad_compressed_dict_page_size.parquet  | Bin 0 -> 293 bytes
 .../parquet-bad-compressed-dict-page-size.test       |   7 +++++++
 tests/query_test/test_scanners.py                    |  13 +++++++++++++
 5 files changed, 26 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/be/src/util/decompress.cc
----------------------------------------------------------------------
diff --git a/be/src/util/decompress.cc b/be/src/util/decompress.cc
index f3466b9..d92dfe2 100644
--- a/be/src/util/decompress.cc
+++ b/be/src/util/decompress.cc
@@ -524,6 +524,7 @@ SnappyDecompressor::SnappyDecompressor(MemPool* mem_pool, bool reuse_buffer)
 }
 
 int64_t SnappyDecompressor::MaxOutputLen(int64_t input_len, const uint8_t* input) {
+  if (input_len <= 0) return -1;
   DCHECK(input != nullptr);
   size_t result;
   if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(input),

http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index 25aa09b..2b92a0a 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -5,6 +5,11 @@ Contains 3 single-column rows:
 "is"
 "fun"
 
+bad_compressed_dict_page_size.parquet:
+Generated by hacking Impala's Parquet writer.
+Contains a single string column 'col' with one row ("a"). The compressed_page_size field
+in dict page header is modifed to 0 to test if it is correctly handled.
+
 bad_rle_literal_count.parquet:
 Generated by hacking Impala's Parquet writer.
 Contains a single bigint column 'c' with the values 1, 3, 7 stored

http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/data/bad_compressed_dict_page_size.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/bad_compressed_dict_page_size.parquet b/testdata/data/bad_compressed_dict_page_size.parquet
new file mode 100644
index 0000000..a5b5ed9
Binary files /dev/null and b/testdata/data/bad_compressed_dict_page_size.parquet differ

http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test
new file mode 100644
index 0000000..4b9a317
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-bad-compressed-dict-page-size.test
@@ -0,0 +1,7 @@
+====
+---- QUERY
+# Parquet file with invalid (0) compressed_page_size in a dict page.
+select * from bad_compressed_dict_page_size;
+---- CATCH
+Snappy: GetUncompressedLength failed
+====
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/impala/blob/6cc76d72/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index fe0577a..9b252da 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -388,6 +388,19 @@ class TestParquet(ImpalaTestSuite):
     self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort',
                        vector, unique_database)
 
+  def test_bad_compressed_page_size(self, vector, unique_database):
+    """IMPALA-6353: Tests that a parquet dict page with 0 compressed_page_size is
+    gracefully handled. """
+    self.client.execute(
+        "create table %s.bad_compressed_dict_page_size (col string) stored as parquet"
+        % unique_database)
+    tbl_loc = get_fs_path("/test-warehouse/%s.db/%s" % (unique_database,
+        "bad_compressed_dict_page_size"))
+    check_call(['hdfs', 'dfs', '-copyFromLocal', os.environ['IMPALA_HOME'] +
+        "/testdata/data/bad_compressed_dict_page_size.parquet", tbl_loc])
+    self.run_test_case('QueryTest/parquet-bad-compressed-dict-page-size', vector,
+        unique_database)
+
   def test_bitpacked_def_levels(self, vector, unique_database):
     """Test that Impala can read a Parquet file with the deprecated bit-packed def
        level encoding."""