You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/02/26 17:47:44 UTC
[impala] 01/02: IMPALA-7979: Fix bug of SkipValues() and GetNextValues() interaction

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 5a270e097214624ced7814692c58b03d575e7ace
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Fri Feb 22 17:52:47 2019 +0100

    IMPALA-7979: Fix bug of SkipValues() and GetNextValues() interaction
    
    Alternating calls to DictDecoder's SkipValues() and GetNextValues()
    could result in a decoding error.
    
    RleBatchDecoder<T>::DecodeLiteralValues() returned false when
    invoked with num_literals_to_consume=0 and had buffered literals.
    Then the caller (DictDecoder::GetNextValues()) thought it failed,
    and in the end it resulted in a decoding error.
    
    It could only occur if GetNextValues() was invoked after SkipValues().
    Otherwise the RleBatchDecoder in DictDecoder could not have buffered
    literals, since they were buffered in the dictionary decoder.
    
    Since SkipValues() is not used yet, it couldn't happen in production
    code.
    
    I added a backend test to fuzz test the interaction between
    SkipValues() and GetNextValues(). I looped the test for a few hours
    without seeing an error.
    
    Change-Id: Id0426cdef54ee9bc2306a542098a92c640dc41c4
    Reviewed-on: http://gerrit.cloudera.org:8080/12555
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/testutil/rand-util.h      |  5 ++--
 be/src/util/dict-encoding.h      |  2 +-
 be/src/util/dict-test.cc         | 61 ++++++++++++++++++++++++++++++++++++++++
 be/src/util/encoding-test-util.h | 60 +++++++++++++++++++++++++++++++++++++++
 be/src/util/rle-encoding.h       |  7 +++--
 be/src/util/rle-test.cc          | 56 ++++++------------------------------
 6 files changed, 138 insertions(+), 53 deletions(-)

diff --git a/be/src/testutil/rand-util.h b/be/src/testutil/rand-util.h
index 255fcb0..4e69428 100644
--- a/be/src/testutil/rand-util.h
+++ b/be/src/testutil/rand-util.h
@@ -30,8 +30,9 @@ namespace impala {
 class RandTestUtil {
  public:
   /// Seed 'rng' with a seed either from the environment variable 'env_var' or the
-  /// current time.
-  static void SeedRng(const char* env_var, std::mt19937* rng) {
+  /// random device of the platform.
+  template <typename RandomEngine>
+  static void SeedRng(const char* env_var, RandomEngine* rng) {
     const char* seed_str = getenv(env_var);
     int64_t seed;
     if (seed_str != nullptr) {
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index 78dc42f..fa52275 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -519,7 +519,7 @@ ALWAYS_INLINE inline bool DictDecoder<T>::GetNextValues(
         count -= num_literals;
       } else { // Case 2
         uint32_t num_to_decode = BitUtil::RoundDown(count, 32);
-        if (UNLIKELY(!data_decoder_.DecodeLiteralValues(
+        if (num_to_decode > 0 && UNLIKELY(!data_decoder_.DecodeLiteralValues(
                 num_to_decode, dict_.data(), dict_.size(), &out))) {
           return false;
         }
diff --git a/be/src/util/dict-test.cc b/be/src/util/dict-test.cc
index 4f46cf2..bb533d3 100644
--- a/be/src/util/dict-test.cc
+++ b/be/src/util/dict-test.cc
@@ -25,8 +25,10 @@
 #include "runtime/string-value.inline.h"
 #include "runtime/timestamp-value.h"
 #include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
 #include "util/bit-packing.inline.h"
 #include "util/dict-encoding.h"
+#include "util/encoding-test-util.h"
 
 #include "common/names.h"
 
@@ -336,6 +338,65 @@ TEST(DictTest, DecodeErrors) {
   }
 }
 
+TEST(DictTest, TestGetNextValuesAndSkippingFuzzy) {
+  const int values_size = 8192;
+  const int rounds = 100;
+  MemTracker tracker;
+  MemTracker track_encoder;
+  MemTracker track_decoder;
+  MemPool pool(&tracker);
+  DictEncoder<int> large_dict_encoder(&pool, sizeof(int), &track_encoder);
+
+  std::default_random_engine random_eng;
+  RandTestUtil::SeedRng("DICT_TEST_SEED", &random_eng);
+
+  // Generates random number between 'bottom' and 'top' (inclusive intervals).
+  auto GetRandom = [&random_eng](int bottom, int top) {
+    std::uniform_int_distribution<int> uni_dist(bottom, top);
+    return uni_dist(random_eng);
+  };
+
+  vector<int> values = MakeRandomSequence(random_eng, values_size, 200, 10);
+  for (int val : values) {
+    large_dict_encoder.Put(val);
+  }
+
+  vector<uint8_t> data_buffer(large_dict_encoder.EstimatedDataEncodedSize() * 2);
+  int data_len = large_dict_encoder.WriteData(data_buffer.data(), data_buffer.size());
+  ASSERT_GT(data_len, 0);
+
+  vector<uint8_t> dict_buffer(large_dict_encoder.dict_encoded_size());
+  large_dict_encoder.WriteDict(dict_buffer.data());
+  large_dict_encoder.Close();
+
+  vector<int32_t> decoded_values(values.size());
+  DictDecoder<int> decoder(&track_decoder);
+  ASSERT_TRUE(decoder.template Reset<parquet::Type::INT32>(
+      dict_buffer.data(), dict_buffer.size(), sizeof(int)));
+
+  for (int round = 0; round < rounds; ++round) {
+    ASSERT_OK(decoder.SetData(data_buffer.data(), data_buffer.size()));
+    int i = 0;
+    while (i < values.size()) {
+      int length = GetRandom(1, 200);
+      if (i + length > values.size()) length = values.size() - i;
+      int skip_or_get = GetRandom(0, 1);
+      if (skip_or_get == 0) {
+        // skip values
+        ASSERT_TRUE(decoder.SkipValues(length));
+      } else {
+        // decode values
+        ASSERT_TRUE(decoder.GetNextValues(&decoded_values[i],
+                sizeof(int32_t), length));
+        for (int j = 0; j < length; ++j) {
+          EXPECT_EQ(values[i+j], decoded_values[i+j]);
+        }
+      }
+      i += length;
+    }
+  }
+}
+
 TEST(DictTest, TestSkippingValues) {
   auto ValidateSkipping = [](const vector<int32_t>& values,
       const vector<int32_t>& dict_values, int skip_at, int skip_count,
diff --git a/be/src/util/encoding-test-util.h b/be/src/util/encoding-test-util.h
new file mode 100644
index 0000000..2e1f5d0
--- /dev/null
+++ b/be/src/util/encoding-test-util.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <random>
+
+namespace impala {
+
+/// Generates a sequence that contains repeated and literal runs with random lengths.
+/// Total length of the sequence is limited by 'max_run_length'. It only generates values
+/// that can be represented on 'bit_width' size bits.
+template<typename RandomEngine>
+std::vector<int> MakeRandomSequence(RandomEngine& random_eng, int total_length,
+    int max_run_length, int bit_width) {
+  auto NextRunLength = [&]() {
+    std::uniform_int_distribution<int> uni_dist(1, max_run_length);
+    return uni_dist(random_eng);
+  };
+  auto IsNextRunRepeated = [&random_eng]() {
+    std::uniform_int_distribution<int> uni_dist(0, 1);
+    return uni_dist(random_eng) == 0;
+  };
+  auto NextVal = [bit_width](int val) {
+    if (bit_width == CHAR_BIT * sizeof(int)) return val + 1;
+    return (val + 1) % (1 << bit_width);
+  };
+
+  std::vector<int> ret;
+  int run_length = 0;
+  int val = 0;
+  int is_repeated = false;
+  while (ret.size() < total_length) {
+    if (run_length == 0) {
+      run_length = NextRunLength();
+      is_repeated = IsNextRunRepeated();
+      val = NextVal(val);
+    }
+    ret.push_back(val);
+    if (!is_repeated) val = NextVal(val);
+    --run_length;
+  }
+  return ret;
+}
+
+}
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index 21adc02..36cadbb 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -714,9 +714,11 @@ template <typename T>
 template <typename OutType>
 inline bool RleBatchDecoder<T>::DecodeLiteralValues(int32_t num_literals_to_consume,
     OutType* dict, int64_t dict_len, StrideWriter<OutType>* RESTRICT out) {
-  DCHECK_GE(num_literals_to_consume, 0);
+  DCHECK_GT(num_literals_to_consume, 0);
   DCHECK_GE(literal_count_, num_literals_to_consume);
 
+  if (num_literals_to_consume == 0) return false;
+
   int32_t num_remaining = num_literals_to_consume;
   // Decode any buffered literals left over from previous calls.
   if (HaveBufferedLiterals()) {
@@ -892,8 +894,7 @@ inline int32_t RleBatchDecoder<T>::SkipValues(int32_t num_values) {
     // Skip RLE encoded values
     int32_t num_repeats = NextNumRepeats();
     if (num_repeats > 0) {
-       int32_t num_repeats_to_consume =
-           std::min(num_repeats, num_values - num_skipped);
+       int32_t num_repeats_to_consume = std::min(num_repeats, num_values - num_skipped);
        SkipRepeatedValues(num_repeats_to_consume);
        num_skipped += num_repeats_to_consume;
        continue;
diff --git a/be/src/util/rle-test.cc b/be/src/util/rle-test.cc
index 1e3caec..3eb9087 100644
--- a/be/src/util/rle-test.cc
+++ b/be/src/util/rle-test.cc
@@ -24,8 +24,10 @@
 #include <random>
 
 #include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
 #include "util/bit-packing.inline.h"
 #include "util/bit-stream-utils.h"
+#include "util/encoding-test-util.h"
 #include "util/rle-encoding.h"
 
 #include "common/names.h"
@@ -172,14 +174,13 @@ class RleTest : public ::testing::Test {
   }
 
   int ValidateRleSkip(const vector<int>& values, int bit_width,
-      int min_repeated_run_length, int skip_at, int skip_count, unsigned int seed=0) {
+      int min_repeated_run_length, int skip_at, int skip_count) {
     stringstream ss;
     ss << "bit_width=" << bit_width
        << " min_repeated_run_length_=" << min_repeated_run_length
        << " skip_at=" << skip_at
        << " skip_count=" << skip_count
-       << " values.size()=" << values.size()
-       << " seed=" << seed;
+       << " values.size()=" << values.size();
     const string& description = ss.str();
     const int len = 64 * 1024;
     uint8_t buffer[len];
@@ -285,44 +286,6 @@ class RleTest : public ::testing::Test {
     return MakeSequenceBitWidth(values, intitial_literal_length, repeated_length,
         trailing_literal_length, 1);
   }
-
-  /// Generates a sequence that contains repeated and literal runs with random lengths.
-  /// Total length of the sequence is limited by 'max_run_length'. The random generation
-  /// is seeded by 'seed' to allow deterministic behavior.
-  vector<int> MakeRandomSequence(unsigned int seed, int total_length, int max_run_length,
-      int bit_width) {
-    std::default_random_engine random_eng(seed);
-    auto NextRunLength = [&]() {
-      std::uniform_int_distribution<int> uni_dist(1, max_run_length);
-      return uni_dist(random_eng);
-    };
-    auto IsNextRunRepeated = [&random_eng]() {
-      std::uniform_int_distribution<int> uni_dist(0, 1);
-      return uni_dist(random_eng) == 0;
-    };
-    auto NextVal = [bit_width](int val) {
-      if (bit_width == CHAR_BIT * sizeof(int)) return val + 1;
-      return (val + 1) % (1 << bit_width);
-    };
-
-    vector<int> ret;
-    int run_length = 0;
-    int val = 0;
-    int is_repeated = false;
-    while (ret.size() < total_length) {
-      if (run_length == 0) {
-        run_length = NextRunLength();
-        is_repeated = IsNextRunRepeated();
-        val = NextVal(val);
-      }
-      ret.push_back(val);
-      if (!is_repeated) {
-        val = NextVal(val);
-      }
-      --run_length;
-    }
-    return ret;
-  }
 };
 
 /// Basic test case for literal unpacking - two literals in a run.
@@ -376,9 +339,8 @@ TEST_F(RleTest, ValueSkippingFuzzy) {
   const int probe_iteration = 100;
   const int total_sequence_length = 2048;
 
-  std::random_device r;
-  unsigned int seed = r();
-  std::default_random_engine random_eng(seed);
+  std::default_random_engine random_eng;
+  RandTestUtil::SeedRng("RLE_TEST_SEED", &random_eng);
 
   // Generates random number between 'bottom' and 'top' (inclusive intervals).
   auto GetRandom = [&random_eng](int bottom, int top) {
@@ -390,12 +352,12 @@ TEST_F(RleTest, ValueSkippingFuzzy) {
     for (int i = 0; i < bitwidth_iteration; ++i) {
       int bit_width = GetRandom(1, 32);
       int max_run_length = GetRandom(5, 200);
-      vector<int> seq = MakeRandomSequence(seed, total_sequence_length, max_run_length,
-          bit_width);
+      vector<int> seq = MakeRandomSequence(random_eng, total_sequence_length,
+          max_run_length, bit_width);
       for (int j = 0; j < probe_iteration; ++j) {
         int skip_at = GetRandom(0, seq.size() - 1);
         int skip_count = GetRandom(1, seq.size() - skip_at);
-        ValidateRleSkip(seq, bit_width, min_run_length, skip_at, skip_count, seed);
+        ValidateRleSkip(seq, bit_width, min_run_length, skip_at, skip_count);
       }
     }
   }