You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/30 02:54:10 UTC
[orc] branch branch-1.7 updated: ORC-976: Optimize compute
zigzagliterals (#887)
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 0d7b02c ORC-976: Optimize compute zigzagliterals (#887)
0d7b02c is described below
commit 0d7b02c6c033f3557d8f8cb0827693fe4e86fc24
Author: guiyanakaung <gu...@gmail.com>
AuthorDate: Mon Aug 30 10:53:33 2021 +0800
ORC-976: Optimize compute zigzagliterals (#887)
### What changes were proposed in this pull request?
```java
private void computeZigZagLiterals() {
// populate zigzag encoded literals
long zzEncVal = 0;
for (int i = 0; i < numLiterals; i++) {
if (signed) {
zzEncVal = utils.zigzagEncode(literals[i]);
} else {
zzEncVal = literals[i];
}
zigzagLiterals[i] = zzEncVal;
}
}
```
Avoid conditional judgments in loops.
The unsigned case can use literals instead of zigzagLiterals.
### Why are the changes needed?
optimize write performance.
### How was this patch tested?
Pass the CIs.
(cherry picked from commit ffa5b2da67d259054900f1c0b3694077ea9104db)
Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
c++/src/RleEncoderV2.cc | 37 ++++++++++++----------
.../apache/orc/impl/RunLengthIntegerWriterV2.java | 25 ++++++++-------
2 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index 44e2761..f823a3d 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -67,7 +67,7 @@ RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
prevDelta(0){
literals = new int64_t[MAX_LITERAL_SIZE];
gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
- zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
+ zigzagLiterals = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr;
baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
adjDeltas = new int64_t[MAX_LITERAL_SIZE];
}
@@ -168,14 +168,9 @@ void RleEncoderV2::write(int64_t val) {
}
void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
- int64_t zzEncVal = 0;
+ assert (isSigned);
for (size_t i = 0; i < numLiterals; i++) {
- if (isSigned) {
- zzEncVal = zigZag(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
+ zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]);
}
}
@@ -290,8 +285,11 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
if (numLiterals <= MIN_REPEAT) {
// we need to compute zigzag values for DIRECT encoding if we decide to
// break early for delta overflows or for shorter runs
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ if (isSigned) {
+ computeZigZagLiterals(option);
+ }
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
option.encoding = DIRECT;
return;
}
@@ -331,8 +329,11 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// PATCHED_BASE condition as encoding using DIRECT is faster and has less
// overhead than PATCHED_BASE
if (!isSafeSubtract(max, option.min)) {
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ if (isSigned) {
+ computeZigZagLiterals(option);
+ }
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
option.encoding = DIRECT;
return;
}
@@ -388,9 +389,12 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
- computeZigZagLiterals(option);
- option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
- option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
+ if (isSigned) {
+ computeZigZagLiterals(option);
+ }
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+ option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
// if the difference between 90th percentile and 100th percentile fixed
@@ -539,7 +543,8 @@ void RleEncoderV2::writeDirectValues(EncodingOption& option) {
writeByte(headerSecondByte);
// bit packing the zigzag encoded literals
- writeInts(zigzagLiterals, 0, numLiterals, fb);
+ int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+ writeInts(currentZigzagLiterals, 0, numLiterals, fb);
// reset run length
variableRunLength = 0;
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
index a0e1e78..266f1bb 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
@@ -134,7 +134,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
private final boolean signed;
private EncodingType encoding;
private int numLiterals;
- private final long[] zigzagLiterals = new long[MAX_SCOPE];
+ private final long[] zigzagLiterals;
private final long[] baseRedLiterals = new long[MAX_SCOPE];
private final long[] adjDeltas = new long[MAX_SCOPE];
private long fixedDelta;
@@ -160,6 +160,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
boolean alignedBitpacking) {
this.output = output;
this.signed = signed;
+ this.zigzagLiterals = signed ? new long[MAX_SCOPE] : null;
this.alignedBitpacking = alignedBitpacking;
this.utils = new SerializationUtils();
clear();
@@ -367,7 +368,8 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
output.write(headerSecondByte);
// bit packing the zigzag encoded literals
- utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
+ long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
+ utils.writeInts(currentZigzagLiterals, 0, numLiterals, fb, output);
// reset run length
variableRunLength = 0;
@@ -412,9 +414,13 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
// we need to compute zigzag values for DIRECT encoding if we decide to
// break early for delta overflows or for shorter runs
- computeZigZagLiterals();
+ // only signed numbers need to compute zigzag values
+ if (signed) {
+ computeZigZagLiterals();
+ }
- zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+ long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
+ zzBits100p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
// not a big win for shorter runs to determine encoding
if (numLiterals <= MIN_REPEAT) {
@@ -504,7 +510,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
// beyond a threshold then we need to patch the values. if the variation
// is not significant then we can use direct encoding
- zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
+ zzBits90p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9);
int diffBitsLH = zzBits100p - zzBits90p;
// if the difference between 90th percentile and 100th percentile fixed
@@ -545,14 +551,9 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
private void computeZigZagLiterals() {
// populate zigzag encoded literals
- long zzEncVal = 0;
+ assert signed : "only signed numbers need to compute zigzag values";
for (int i = 0; i < numLiterals; i++) {
- if (signed) {
- zzEncVal = utils.zigzagEncode(literals[i]);
- } else {
- zzEncVal = literals[i];
- }
- zigzagLiterals[i] = zzEncVal;
+ zigzagLiterals[i] = utils.zigzagEncode(literals[i]);
}
}