You are viewing a plain text version of this content; the canonical HTML version is available from the mailing-list archive (the hyperlink was lost in this plain-text extraction).
Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/06 20:23:09 UTC

parquet-cpp git commit: PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 05409546f -> 708507f3e


PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #155 from xhochy/parquet-708 and squashes the following commits:

a537717 [Uwe L. Korn] Test for more bit widths
f346435 [Uwe L. Korn] PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/708507f3
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/708507f3
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/708507f3

Branch: refs/heads/master
Commit: 708507f3e1dd69d097d674c31b0105e716c199e1
Parents: 0540954
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Tue Sep 6 16:20:54 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Sep 6 16:20:54 2016 -0400

----------------------------------------------------------------------
 src/parquet/column/levels-test.cc | 32 ++++++++++++++++++++++++++++++++
 src/parquet/util/rle-encoding.h   |  8 ++++++--
 2 files changed, 38 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/column/levels-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/levels-test.cc b/src/parquet/column/levels-test.cc
index cb7cc38..126873c 100644
--- a/src/parquet/column/levels-test.cc
+++ b/src/parquet/column/levels-test.cc
@@ -206,4 +206,36 @@ TEST(TestLevelEncoder, MinimumBufferSize) {
   ASSERT_EQ(kNumToEncode, encode_count);
 }
 
+TEST(TestLevelEncoder, MinimumBufferSize2) {
+  // PARQUET-708
+  // Test the worst case for bit_width=2 consisting of
+  // LiteralRun(size=8)
+  // RepeatedRun(size=8)
+  // LiteralRun(size=8)
+  // ...
+  const int kNumToEncode = 1024;
+
+  std::vector<int16_t> levels;
+  for (int i = 0; i < kNumToEncode; ++i) {
+    // This forces a literal run of 00000001
+    // followed by eight 1s
+    if ((i % 16) < 7) {
+      levels.push_back(0);
+    } else {
+      levels.push_back(1);
+    }
+  }
+
+  for (int bit_width = 1; bit_width <= 8; bit_width++) {
+    std::vector<uint8_t> output(
+        LevelEncoder::MaxBufferSize(Encoding::RLE, bit_width, kNumToEncode));
+
+    LevelEncoder encoder;
+    encoder.Init(Encoding::RLE, bit_width, kNumToEncode, output.data(), output.size());
+    int encode_count = encoder.Encode(kNumToEncode, levels.data());
+
+    ASSERT_EQ(kNumToEncode, encode_count);
+  }
+}
+
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
index 15fd550..8fa1c41 100644
--- a/src/parquet/util/rle-encoding.h
+++ b/src/parquet/util/rle-encoding.h
@@ -171,8 +171,12 @@ class RleEncoder {
 
   /// Returns the maximum byte size it could take to encode 'num_values'.
   static int MaxBufferSize(int bit_width, int num_values) {
-    int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0);
-    int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
+    // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+    // and then a repeated run of length 8".
+    // 8 values per smallest run, 8 bits per byte
+    // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8);
+    int bytes_per_run = bit_width;
+    int num_runs = BitUtil::Ceil(num_values, 8);
     int literal_max_size = num_runs + num_runs * bytes_per_run;
 
     // In the very worst case scenario, the data is a concatenation of repeated