You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/08/30 02:54:10 UTC

[orc] branch branch-1.7 updated: ORC-976: Optimize compute zigzagliterals (#887)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-1.7 by this push:
     new 0d7b02c  ORC-976: Optimize compute zigzagliterals (#887)
0d7b02c is described below

commit 0d7b02c6c033f3557d8f8cb0827693fe4e86fc24
Author: guiyanakaung <gu...@gmail.com>
AuthorDate: Mon Aug 30 10:53:33 2021 +0800

    ORC-976: Optimize compute zigzagliterals (#887)
    
    ### What changes were proposed in this pull request?
    
    ```java
      private void computeZigZagLiterals() {
        // populate zigzag encoded literals
        long zzEncVal = 0;
        for (int i = 0; i < numLiterals; i++) {
          if (signed) {
            zzEncVal = utils.zigzagEncode(literals[i]);
          } else {
            zzEncVal = literals[i];
          }
          zigzagLiterals[i] = zzEncVal;
        }
      }
    ```
    
    Avoid conditional judgments in loops.
    The unsigned case can use literals instead of zigzagLiterals.
    
    ### Why are the changes needed?
    
    optimize write performance.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    (cherry picked from commit ffa5b2da67d259054900f1c0b3694077ea9104db)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 c++/src/RleEncoderV2.cc                            | 37 ++++++++++++----------
 .../apache/orc/impl/RunLengthIntegerWriterV2.java  | 25 ++++++++-------
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index 44e2761..f823a3d 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -67,7 +67,7 @@ RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
         prevDelta(0){
     literals = new int64_t[MAX_LITERAL_SIZE];
     gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
-    zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
+    zigzagLiterals = hasSigned ? new int64_t[MAX_LITERAL_SIZE] : nullptr;
     baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
     adjDeltas = new int64_t[MAX_LITERAL_SIZE];
 }
@@ -168,14 +168,9 @@ void RleEncoderV2::write(int64_t val) {
 }
 
 void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
-    int64_t zzEncVal = 0;
+    assert (isSigned);
     for (size_t i = 0; i < numLiterals; i++) {
-        if (isSigned) {
-            zzEncVal = zigZag(literals[i]);
-        } else {
-            zzEncVal = literals[i];
-        }
-        zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
+        zigzagLiterals[option.zigzagLiteralsCount++] = zigZag(literals[i]);
     }
 }
 
@@ -290,8 +285,11 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     if (numLiterals <= MIN_REPEAT) {
         // we need to compute zigzag values for DIRECT encoding if we decide to
         // break early for delta overflows or for shorter runs
-        computeZigZagLiterals(option);
-        option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+        if (isSigned) {
+            computeZigZagLiterals(option);
+        }
+        int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+        option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
         option.encoding = DIRECT;
         return;
     }
@@ -331,8 +329,11 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     // PATCHED_BASE condition as encoding using DIRECT is faster and has less
     // overhead than PATCHED_BASE
     if (!isSafeSubtract(max, option.min)) {
-        computeZigZagLiterals(option);
-        option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+        if (isSigned) {
+            computeZigZagLiterals(option);
+        }
+        int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+        option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
         option.encoding = DIRECT;
         return;
     }
@@ -388,9 +389,12 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     // beyond a threshold then we need to patch the values. if the variation
     // is not significant then we can use direct encoding
 
-    computeZigZagLiterals(option);
-    option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
-    option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
+    if (isSigned) {
+        computeZigZagLiterals(option);
+    }
+    int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+    option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+    option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
     uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
 
     // if the difference between 90th percentile and 100th percentile fixed
@@ -539,7 +543,8 @@ void RleEncoderV2::writeDirectValues(EncodingOption& option) {
     writeByte(headerSecondByte);
 
     // bit packing the zigzag encoded literals
-    writeInts(zigzagLiterals, 0, numLiterals, fb);
+    int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+    writeInts(currentZigzagLiterals, 0, numLiterals, fb);
 
     // reset run length
     variableRunLength = 0;
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
index a0e1e78..266f1bb 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
@@ -134,7 +134,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
   private final boolean signed;
   private EncodingType encoding;
   private int numLiterals;
-  private final long[] zigzagLiterals = new long[MAX_SCOPE];
+  private final long[] zigzagLiterals;
   private final long[] baseRedLiterals = new long[MAX_SCOPE];
   private final long[] adjDeltas = new long[MAX_SCOPE];
   private long fixedDelta;
@@ -160,6 +160,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
       boolean alignedBitpacking) {
     this.output = output;
     this.signed = signed;
+    this.zigzagLiterals = signed ? new long[MAX_SCOPE] : null;
     this.alignedBitpacking = alignedBitpacking;
     this.utils = new SerializationUtils();
     clear();
@@ -367,7 +368,8 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
     output.write(headerSecondByte);
 
     // bit packing the zigzag encoded literals
-    utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output);
+    long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
+    utils.writeInts(currentZigzagLiterals, 0, numLiterals, fb, output);
 
     // reset run length
     variableRunLength = 0;
@@ -412,9 +414,13 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
 
     // we need to compute zigzag values for DIRECT encoding if we decide to
     // break early for delta overflows or for shorter runs
-    computeZigZagLiterals();
+    // only signed numbers need to compute zigzag values
+    if (signed) {
+      computeZigZagLiterals();
+    }
 
-    zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
+    long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
+    zzBits100p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
 
     // not a big win for shorter runs to determine encoding
     if (numLiterals <= MIN_REPEAT) {
@@ -504,7 +510,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
     // beyond a threshold then we need to patch the values. if the variation
     // is not significant then we can use direct encoding
 
-    zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9);
+    zzBits90p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9);
     int diffBitsLH = zzBits100p - zzBits90p;
 
     // if the difference between 90th percentile and 100th percentile fixed
@@ -545,14 +551,9 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
 
   private void computeZigZagLiterals() {
     // populate zigzag encoded literals
-    long zzEncVal = 0;
+    assert signed : "only signed numbers need to compute zigzag values";
     for (int i = 0; i < numLiterals; i++) {
-      if (signed) {
-        zzEncVal = utils.zigzagEncode(literals[i]);
-      } else {
-        zzEncVal = literals[i];
-      }
-      zigzagLiterals[i] = zzEncVal;
+      zigzagLiterals[i] = utils.zigzagEncode(literals[i]);
     }
   }