You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/12/04 21:47:45 UTC

impala git commit: IMPALA-5987: LZ4 Codec silently produces bogus compressed data for large inputs

Repository: impala
Updated Branches:
  refs/heads/master d428c16f1 -> 7e368b8f0


IMPALA-5987: LZ4 Codec silently produces bogus compressed data for large inputs

When Lz4Compressor::MaxOutputLen returns 0, it
means that the input is too large to compress.
When invoked Lz4Compressor::ProcessBlock with
an input too large, it silently produced a bogus
result. This bogus result even decompresses
successfully, but not to the data that was
originally compressed.

After this commit, Lz4Compressor::ProcessBlock
will return error if it cannot compress the
input.

I also added a comment on Codec::MaxOutputLen()
that return value 0 means that the input is too
large.

I added some checks after the invocations of
MaxOutputLen() where the compressor can be
a Lz4Compressor.

I added an  automated test case to
be/src/util/decompress-test.cc.

Change-Id: Ifb0bc4ed98c5d7b628b791aa90ead36347b9fbb8
Reviewed-on: http://gerrit.cloudera.org:8080/8748
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/7e368b8f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/7e368b8f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/7e368b8f

Branch: refs/heads/master
Commit: 7e368b8f0f6255d338beb127ff6a3c4e11a28c9e
Parents: d428c16
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Mon Dec 4 18:05:25 2017 +0100
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Mon Dec 4 21:43:00 2017 +0000

----------------------------------------------------------------------
 .../benchmarks/row-batch-serialize-benchmark.cc |  1 +
 be/src/catalog/catalog-util.cc                  |  1 +
 be/src/runtime/row-batch.cc                     |  1 +
 be/src/util/codec.h                             |  1 +
 be/src/util/compress.cc                         |  3 +++
 be/src/util/decompress-test.cc                  | 26 ++++++++++++++++++++
 be/src/util/runtime-profile.cc                  |  4 ++-
 common/thrift/generate_error_codes.py           |  3 +++
 8 files changed, 39 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/benchmarks/row-batch-serialize-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/row-batch-serialize-benchmark.cc b/be/src/benchmarks/row-batch-serialize-benchmark.cc
index 699b91b..a2071ca 100644
--- a/be/src/benchmarks/row-batch-serialize-benchmark.cc
+++ b/be/src/benchmarks/row-batch-serialize-benchmark.cc
@@ -124,6 +124,7 @@ class RowBatchSerializeBaseline {
           MakeScopeExitTrigger([&compressor]() { compressor.Close(); });
 
       int64_t compressed_size = compressor.MaxOutputLen(size);
+      DCHECK_GT(compressed_size, 0);
       if (batch->compression_scratch_.size() < compressed_size) {
         batch->compression_scratch_.resize(compressed_size);
       }

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/catalog/catalog-util.cc
----------------------------------------------------------------------
diff --git a/be/src/catalog/catalog-util.cc b/be/src/catalog/catalog-util.cc
index de4f2fd..e968742 100644
--- a/be/src/catalog/catalog-util.cc
+++ b/be/src/catalog/catalog-util.cc
@@ -195,6 +195,7 @@ Status CompressCatalogObject(string* catalog_object) {
       &compressor));
   string output_buffer;
   int64_t compressed_data_len = compressor->MaxOutputLen(catalog_object->size());
+  DCHECK_GT(compressed_data_len, 0);
   int64_t output_buffer_len = compressed_data_len + sizeof(uint32_t);
   output_buffer.resize(output_buffer_len);
   uint8_t* output_buffer_ptr =

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/runtime/row-batch.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.cc b/be/src/runtime/row-batch.cc
index cd8c936..bad1f1c 100644
--- a/be/src/runtime/row-batch.cc
+++ b/be/src/runtime/row-batch.cc
@@ -265,6 +265,7 @@ Status RowBatch::Serialize(bool full_dedup, vector<int32_t>* tuple_offsets,
         MakeScopeExitTrigger([&compressor]() { compressor.Close(); });
 
     int64_t compressed_size = compressor.MaxOutputLen(size);
+    DCHECK_GT(compressed_size, 0);
     if (compression_scratch_.size() < compressed_size) {
       compression_scratch_.resize(compressed_size);
     }

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/util/codec.h
----------------------------------------------------------------------
diff --git a/be/src/util/codec.h b/be/src/util/codec.h
index d21e399..8d6b88f 100644
--- a/be/src/util/codec.h
+++ b/be/src/util/codec.h
@@ -149,6 +149,7 @@ class Codec {
   /// a buffer.
   /// This must be an O(1) operation (i.e. cannot read all of input).  Codecs that
   /// don't support this should return -1.
+  /// Return value 0 means error, the input size is too large.
   virtual int64_t MaxOutputLen(int64_t input_len, const uint8_t* input = nullptr) = 0;
 
   /// Must be called on codec before destructor for final cleanup.

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/util/compress.cc
----------------------------------------------------------------------
diff --git a/be/src/util/compress.cc b/be/src/util/compress.cc
index 90656aa..7a0181b 100644
--- a/be/src/util/compress.cc
+++ b/be/src/util/compress.cc
@@ -311,6 +311,9 @@ Status Lz4Compressor::ProcessBlock(bool output_preallocated, int64_t input_lengt
   DCHECK_GE(input_length, 0);
   CHECK(output_preallocated) << "Output was not allocated for Lz4 Codec";
   if (input_length == 0) return Status::OK();
+  if (MaxOutputLen(input_length, input) == 0) {
+    return Status(TErrorCode::LZ4_COMPRESSION_INPUT_TOO_LARGE, input_length);
+  }
   *output_length = LZ4_compress_default(reinterpret_cast<const char*>(input),
       reinterpret_cast<char*>(*output), input_length, *output_length);
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/util/decompress-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/decompress-test.cc b/be/src/util/decompress-test.cc
index d170501..2a2299c 100644
--- a/be/src/util/decompress-test.cc
+++ b/be/src/util/decompress-test.cc
@@ -251,6 +251,7 @@ class DecompressorTest : public ::testing::Test {
       Codec* decompressor, int64_t input_len, uint8_t* input) {
     // Preallocated output buffers for compressor
     int64_t max_compressed_length = compressor->MaxOutputLen(input_len, input);
+    ASSERT_GT(max_compressed_length, 0);
     uint8_t* compressed = mem_pool_.Allocate(max_compressed_length);
     int64_t compressed_length = max_compressed_length;
 
@@ -441,6 +442,31 @@ TEST_F(DecompressorTest, Impala5250) {
   EXPECT_EQ(output_length, 0);
 }
 
+TEST_F(DecompressorTest, LZ4Huge) {
+  // IMPALA-5987: When Lz4Compressor::MaxOutputLen() returns 0,
+  // it means that the input is too large to compress, therefore trying
+  // to compress it should fail.
+
+  // Generate a big random payload.
+  int payload_len = numeric_limits<int>::max();
+  uint8_t* payload = new uint8_t[payload_len];
+  for (int i = 0 ; i < payload_len; ++i) payload[i] = rand();
+
+  scoped_ptr<Codec> compressor;
+  EXPECT_OK(Codec::CreateCompressor(nullptr, true, impala::THdfsCompression::LZ4,
+      &compressor));
+
+  // The returned max_size is 0 because the payload is too big.
+  int64_t max_size = compressor->MaxOutputLen(payload_len);
+  ASSERT_EQ(max_size, 0);
+
+  // Trying to compress it should give an error
+  int64_t compressed_len = max_size;
+  uint8_t* compressed = new uint8_t[max_size];
+  EXPECT_ERROR(compressor->ProcessBlock(true, payload_len, payload,
+      &compressed_len, &compressed), TErrorCode::LZ4_COMPRESSION_INPUT_TOO_LARGE);
+}
+
 }
 
 int main(int argc, char **argv) {

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/be/src/util/runtime-profile.cc
----------------------------------------------------------------------
diff --git a/be/src/util/runtime-profile.cc b/be/src/util/runtime-profile.cc
index 4254b9c..abb691d 100644
--- a/be/src/util/runtime-profile.cc
+++ b/be/src/util/runtime-profile.cc
@@ -758,7 +758,9 @@ Status RuntimeProfile::SerializeToArchiveString(stringstream* out) const {
       MakeScopeExitTrigger([&compressor]() { compressor->Close(); });
 
   vector<uint8_t> compressed_buffer;
-  compressed_buffer.resize(compressor->MaxOutputLen(serialized_buffer.size()));
+  int64_t max_compressed_size = compressor->MaxOutputLen(serialized_buffer.size());
+  DCHECK_GT(max_compressed_size, 0);
+  compressed_buffer.resize(max_compressed_size);
   int64_t result_len = compressed_buffer.size();
   uint8_t* compressed_buffer_ptr = compressed_buffer.data();
   RETURN_IF_ERROR(compressor->ProcessBlock(true, serialized_buffer.size(),

http://git-wip-us.apache.org/repos/asf/impala/blob/7e368b8f/common/thrift/generate_error_codes.py
----------------------------------------------------------------------
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index a108bb7..bf1953e 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -341,6 +341,9 @@ error_codes = (
 
   ("BAD_PRINCIPAL_FORMAT", 112,
     "Kerberos principal should be of the form: <service>/<hostname>@<realm> - got: $0"),
+
+  ("LZ4_COMPRESSION_INPUT_TOO_LARGE", 113,
+   "The input size is too large for LZ4 compression: $0"),
 )
 
 import sys