You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/06 20:23:09 UTC
parquet-cpp git commit: PARQUET-708: account for "worst case
scenario" in MaxBufferSize for bit_width > 1
Repository: parquet-cpp
Updated Branches:
refs/heads/master 05409546f -> 708507f3e
PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #155 from xhochy/parquet-708 and squashes the following commits:
a537717 [Uwe L. Korn] Test for more bit widths
f346435 [Uwe L. Korn] PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/708507f3
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/708507f3
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/708507f3
Branch: refs/heads/master
Commit: 708507f3e1dd69d097d674c31b0105e716c199e1
Parents: 0540954
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Tue Sep 6 16:20:54 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Sep 6 16:20:54 2016 -0400
----------------------------------------------------------------------
src/parquet/column/levels-test.cc | 32 ++++++++++++++++++++++++++++++++
src/parquet/util/rle-encoding.h | 8 ++++++--
2 files changed, 38 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/column/levels-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/levels-test.cc b/src/parquet/column/levels-test.cc
index cb7cc38..126873c 100644
--- a/src/parquet/column/levels-test.cc
+++ b/src/parquet/column/levels-test.cc
@@ -206,4 +206,36 @@ TEST(TestLevelEncoder, MinimumBufferSize) {
ASSERT_EQ(kNumToEncode, encode_count);
}
+TEST(TestLevelEncoder, MinimumBufferSize2) {
+ // PARQUET-708
+ // Test the worst case for bit_width=2 consisting of
+ // LiteralRun(size=8)
+ // RepeatedRun(size=8)
+ // LiteralRun(size=8)
+ // ...
+ const int kNumToEncode = 1024;
+
+ std::vector<int16_t> levels;
+ for (int i = 0; i < kNumToEncode; ++i) {
+ // This forces a literal run of 00000001
+ // followed by eight 1s
+ if ((i % 16) < 7) {
+ levels.push_back(0);
+ } else {
+ levels.push_back(1);
+ }
+ }
+
+ for (int bit_width = 1; bit_width <= 8; bit_width++) {
+ std::vector<uint8_t> output(
+ LevelEncoder::MaxBufferSize(Encoding::RLE, bit_width, kNumToEncode));
+
+ LevelEncoder encoder;
+ encoder.Init(Encoding::RLE, bit_width, kNumToEncode, output.data(), output.size());
+ int encode_count = encoder.Encode(kNumToEncode, levels.data());
+
+ ASSERT_EQ(kNumToEncode, encode_count);
+ }
+}
+
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/708507f3/src/parquet/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/rle-encoding.h b/src/parquet/util/rle-encoding.h
index 15fd550..8fa1c41 100644
--- a/src/parquet/util/rle-encoding.h
+++ b/src/parquet/util/rle-encoding.h
@@ -171,8 +171,12 @@ class RleEncoder {
/// Returns the maximum byte size it could take to encode 'num_values'.
static int MaxBufferSize(int bit_width, int num_values) {
- int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0);
- int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
+ // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+ // and then a repeated run of length 8".
+ // 8 values per smallest run, 8 bits per byte
+ // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8);
+ int bytes_per_run = bit_width;
+ int num_runs = BitUtil::Ceil(num_values, 8);
int literal_max_size = num_runs + num_runs * bytes_per_run;
// In the very worst case scenario, the data is a concatenation of repeated