You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by do...@apache.org on 2021/09/19 21:18:27 UTC

[orc] branch main updated: ORC-1005: Make that the java and C++ impl of determineEncoding in RLEV2 are consistent (#913)

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/main by this push:
     new 57fc199  ORC-1005: Make that the java and C++ impl of determineEncoding in RLEV2 are consistent (#913)
57fc199 is described below

commit 57fc199c8961cfffc5fb7052d00380bba4e65fc6
Author: Guiyanakaung <gu...@gmail.com>
AuthorDate: Mon Sep 20 05:18:23 2021 +0800

    ORC-1005: Make that the java and C++ impl of determineEncoding in RLEV2 are consistent (#913)
    
    ### What changes were proposed in this pull request?
    
    1. Make that the java and C++ impl of determineEncoding in RLEV2 are consistent.
    2. Extracts the prepare for Direct or PatchedBase encoding into a separate function (prepareForDirectOrPatchedBase) to avoid code duplication.
    
    ### Why are the changes needed?
    
    The current Java determineEncoding impl is not optimal and may compute ZigzagLiterals and zzBits100p before deciding on DELTA encoding, which is redundant.
    
    The C++ implementation is better, keeping Java consistent with C++.
    
    ### How was this patch tested?
    
    Pass the CIs.
---
 c++/src/RLEv2.hh                                   |  1 +
 c++/src/RleEncoderV2.cc                            | 32 ++++++++++++----------
 .../apache/orc/impl/RunLengthIntegerWriterV2.java  | 21 ++++++++++----
 3 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
index f85dabd..e12688e 100644
--- a/c++/src/RLEv2.hh
+++ b/c++/src/RLEv2.hh
@@ -93,6 +93,7 @@ private:
     int64_t*  adjDeltas;
 
     uint32_t getOpCode(EncodingType encoding);
+    int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
     void determineEncoding(EncodingOption& option);
     void computeZigZagLiterals(EncodingOption& option);
     void preparePatchedBlob(EncodingOption& option);
diff --git a/c++/src/RleEncoderV2.cc b/c++/src/RleEncoderV2.cc
index f823a3d..bdae97a 100644
--- a/c++/src/RleEncoderV2.cc
+++ b/c++/src/RleEncoderV2.cc
@@ -276,6 +276,20 @@ void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
     }
 }
 
+/**
+ * Prepare for Direct or PatchedBase encoding
+ * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
+ * @return zigzagLiterals
+ */
+int64_t* RleEncoderV2::prepareForDirectOrPatchedBase(EncodingOption& option) {
+    if (isSigned) {
+        computeZigZagLiterals(option);
+    }
+    int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
+    option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+    return currentZigzagLiterals;
+}
+
 void RleEncoderV2::determineEncoding(EncodingOption& option) {
     // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings,
     // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag
@@ -285,11 +299,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     if (numLiterals <= MIN_REPEAT) {
         // we need to compute zigzag values for DIRECT encoding if we decide to
         // break early for delta overflows or for shorter runs
-        if (isSigned) {
-            computeZigZagLiterals(option);
-        }
-        int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
-        option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+        prepareForDirectOrPatchedBase(option);
         option.encoding = DIRECT;
         return;
     }
@@ -329,11 +339,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     // PATCHED_BASE condition as encoding using DIRECT is faster and has less
     // overhead than PATCHED_BASE
     if (!isSafeSubtract(max, option.min)) {
-        if (isSigned) {
-            computeZigZagLiterals(option);
-        }
-        int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
-        option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+        prepareForDirectOrPatchedBase(option);
         option.encoding = DIRECT;
         return;
     }
@@ -389,11 +395,7 @@ void RleEncoderV2::determineEncoding(EncodingOption& option) {
     // beyond a threshold then we need to patch the values. if the variation
     // is not significant then we can use direct encoding
 
-    if (isSigned) {
-        computeZigZagLiterals(option);
-    }
-    int64_t* currentZigzagLiterals = isSigned ? zigzagLiterals : literals;
-    option.zzBits100p = percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+    int64_t* currentZigzagLiterals = prepareForDirectOrPatchedBase(option);
     option.zzBits90p = percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9, true);
     uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
 
diff --git a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
index 266f1bb..5ddc8ec 100644
--- a/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
+++ b/java/core/src/java/org/apache/orc/impl/RunLengthIntegerWriterV2.java
@@ -410,20 +410,28 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
     fixedRunLength = 0;
   }
 
-  private void determineEncoding() {
-
-    // we need to compute zigzag values for DIRECT encoding if we decide to
-    // break early for delta overflows or for shorter runs
+  /**
+   * Prepare for Direct or PatchedBase encoding
+   * compute zigZagLiterals and zzBits100p (Max number of encoding bits required)
+   * @return zigzagLiterals
+   */
+  private long[] prepareForDirectOrPatchedBase() {
     // only signed numbers need to compute zigzag values
     if (signed) {
       computeZigZagLiterals();
     }
-
     long[] currentZigzagLiterals = signed ? zigzagLiterals : literals;
     zzBits100p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 1.0);
+    return currentZigzagLiterals;
+  }
+
+  private void determineEncoding() {
+    // we need to compute zigzag values for DIRECT encoding if we decide to
+    // break early for delta overflows or for shorter runs
 
     // not a big win for shorter runs to determine encoding
     if (numLiterals <= MIN_REPEAT) {
+      prepareForDirectOrPatchedBase();
       encoding = EncodingType.DIRECT;
       return;
     }
@@ -463,6 +471,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
     // PATCHED_BASE condition as encoding using DIRECT is faster and has less
     // overhead than PATCHED_BASE
     if (!utils.isSafeSubtract(max, min)) {
+      prepareForDirectOrPatchedBase();
       encoding = EncodingType.DIRECT;
       return;
     }
@@ -509,7 +518,7 @@ public class RunLengthIntegerWriterV2 implements IntegerWriter {
     // number of bit requirement between 90th and 100th percentile varies
     // beyond a threshold then we need to patch the values. if the variation
     // is not significant then we can use direct encoding
-
+    long[] currentZigzagLiterals = prepareForDirectOrPatchedBase();
     zzBits90p = utils.percentileBits(currentZigzagLiterals, 0, numLiterals, 0.9);
     int diffBitsLH = zzBits100p - zzBits90p;