You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/09/19 21:18:27 UTC
[orc] branch main updated: ORC-1005: Make that the java and C++
impl of determineEncoding in RLEV2 are consistent (#913)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new 57fc199 ORC-1005: Make that the java and C++ impl of determineEncoding in RLEV2 are consistent (#913)
57fc199 is described below
commit 57fc199c8961cfffc5fb7052d00380bba4e65fc6
Author: Guiyanakaung <gu...@gmail.com>
AuthorDate: Mon Sep 20 05:18:23 2021 +0800
ORC-1005: Make that the java and C++ impl of determineEncoding in RLEV2 are consistent (#913)
### What changes were proposed in this pull request?
1. Make that the java and C++ impl of determineEncoding in RLEV2 are consistent.
2. Extracts the prepare for Direct or PatchedBase encoding into a separate function (prepareForDirectOrPatchedBase) to avoid code duplication.
### Why are the changes needed?
The current Java determineEncoding impl is not optimal and may compute ZigzagLiterals and zzBits100p before deciding on DELTA encoding, which is redundant.
The C++ implementation is better, keeping Java consistent with C++.
### How was this patch tested?
Pass the CIs.
---
c++/src/RLEv2.hh | 1 +
c++/src/RleEncoderV2.cc | 32 ++++++++++++----------
.../apache/orc/impl/RunLengthIntegerWriterV2.java | 21 ++++++++++----
3 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
index f85dabd..e12688e 100644
--- a/c++/src/RLEv2.hh
+++ b/c++/src/RLEv2.hh
@@ -93,6 +93,7 @@ private:
int64_t* adjDeltas;
uint32_t getOpCode(EncodingType encoding);
+ int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
void determineEncoding(EncodingOption& option);
void computeZigZagLiterals(EncodingOption& option);
void preparePatchedBlob(EncodingOption& option);
diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index f823a3d..bdae97a 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -276,6 +276,20 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
}
}
+/**
+ * Prepare for Direct or PatchedBase encoding
+ * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
+ * @return zigzagLiterals
+ */
+int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) {
+ if (isSigned) {
+ computeZigZagLiterals(option);
+ }
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ return currentZigzagLiterals;
+}
+
void RleEncoderV2::determineEncoding(EncodingOption& option) {
// We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
// but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
@@ -285,11 +299,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
if (numLiterals <= MIN_REPEAT) {
// we need to compute zigzag values for DIRECT encoding if we decide to
// break early for delta overflows or for shorter runs
- if (isSigned) {
- computeZigZagLiterals(option);
- }
- int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
- option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ prepareForDirectOrPatchedBase(option);
option.encoding = DIRECT;
return;
}
@@ -329,11 +339,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!isSafeSubtract(max, option.min)) {
- if (isSigned) {
- computeZigZagLiterals(option);
- }
- int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
- option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ prepareForDirectOrPatchedBase(option);
option.encoding = DIRECT;
return;
}
@@ -389,11 +395,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
- if (isSigned) {
- computeZigZagLiterals(option);
- }
- int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
- option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ int64_t* currentZigzagLiterals = prepareForDirectOrPatchedBase(option);
option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
index 266f1bb..5ddc8ec 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
@@ -410,20 +410,28 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
fixedRunLength = 0;
}
- private void determineEncoding() {
-
- // we need to compute zigzag values for DIRECT encoding if we decide to
- // break early for delta overflows or for shorter runs
+ /**
+ * Prepare for Direct or PatchedBase encoding
+ * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
+ * @return zigzagLiterals
+ */
+ private long[] prepareForDirectOrPatchedBase() {
// only signed numbers need to compute zigzag values
if (signed) {
computeZigZagLiterals();
}
-
long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
zzBits100p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ return currentZigzagLiterals;
+ }
+
+ private void determineEncoding() {
+ // we need to compute zigzag values for DIRECT encoding if we decide to
+ // break early for delta overflows or for shorter runs
// not a big win for shorter runs to determine encoding
if (numLiterals <= MIN_REPEAT) {
+ prepareForDirectOrPatchedBase();
encoding = EncodingType.DIRECT;
return;
}
@@ -463,6 +471,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!utils.isSafeSubtract(max, min)) {
+ prepareForDirectOrPatchedBase();
encoding = EncodingType.DIRECT;
return;
}
@@ -509,7 +518,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
// number of bit requirement between 90th and 100th percentile varies
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
-
+ long[] currentZigzagLiterals = prepareForDirectOrPatchedBase();
zzBits90p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9);
int diffBitsLH = zzBits100p - zzBits90p;