You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@orc.apache.org by yuruiz <gi...@git.apache.org> on 2018/05/25 04:03:41 UTC

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

GitHub user yuruiz opened a pull request:

    https://github.com/apache/orc/pull/273

    ORC-343 Enable C++ writer to support RleV2

    1. Port RleV2 implementation from Java to C++
    2. Add RleV2 relevant tests to C++

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/yuruiz/orc dev/yuruiz/RLEv2

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/orc/pull/273.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #273
    
----
commit 3fba0988cbd7f324c5d21ae67f8cd50156957886
Author: Yurui Zhou <yu...@...>
Date:   2018-03-09T05:23:37Z

    ORC-343 Enable C++ writer to support RleV2
    1. Port RleV2 implementation from Java to C++
    2. Add RleV2 relevant tests to C++

----


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191291163
  
    --- Diff: c++/src/RLEv2.hh ---
    @@ -25,13 +25,89 @@
     
     #include <vector>
     
    +#define MIN_REPEAT 3
    +#define HIST_LEN 32
     namespace orc {
     
    -class RleDecoderV2 : public RleDecoder {
    +struct FixedBitSizes {
    +    enum FBS {
    +        ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
    +        THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
    +        TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
    +        TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR
    --- End diff --
    
    can you add another element `LAST=SIXTYFOUR` towards the end?


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191337473
  
    --- Diff: c++/src/Writer.cc ---
    @@ -38,9 +38,10 @@ namespace orc {
         FileVersion fileVersion;
         double dictionaryKeySizeThreshold;
         bool enableIndex;
    +    RleVersion rleVersion;
     
         WriterOptionsPrivate() :
    -                            fileVersion(0, 11) { // default to Hive_0_11
    +                            fileVersion(0, 12) { // default to Hive_0_12
    --- End diff --
    
    I can do that after PR #274 checked in 


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by rip-nsk <gi...@git.apache.org>.

Github user rip-nsk commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190921533
  
    --- Diff: c++/src/RLEV2Util.hh ---
    @@ -0,0 +1,145 @@
    +/**
    +* Licensed to the Apache Software Foundation (ASF) under one
    +* or more contributor license agreements.  See the NOTICE file
    +* distributed with this work for additional information
    +* regarding copyright ownership.  The ASF licenses this file
    +* to you under the Apache License, Version 2.0 (the
    +* "License"); you may not use this file except in compliance
    +* with the License.  You may obtain a copy of the License at
    +*
    +*     http://www.apache.org/licenses/LICENSE-2.0
    +*
    +* Unless required by applicable law or agreed to in writing, software
    +* distributed under the License is distributed on an "AS IS" BASIS,
    +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +* See the License for the specific language governing permissions and
    +* limitations under the License.
    +*/
    +
    +#ifndef ORC_RLEV2UTIL_HH
    +#define ORC_RLEV2UTIL_HH
    +
    +#include "RLEv2.hh"
    +
    +namespace orc {
    +  inline uint32_t decodeBitWidth(uint32_t n) {
    +    if (n <= FixedBitSizes::TWENTYFOUR) {
    +      return n + 1;
    +    } else if (n == FixedBitSizes::TWENTYSIX) {
    +      return 26;
    +    } else if (n == FixedBitSizes::TWENTYEIGHT) {
    +      return 28;
    +    } else if (n == FixedBitSizes::THIRTY) {
    +      return 30;
    +    } else if (n == FixedBitSizes::THIRTYTWO) {
    +      return 32;
    +    } else if (n == FixedBitSizes::FORTY) {
    +      return 40;
    +    } else if (n == FixedBitSizes::FORTYEIGHT) {
    +      return 48;
    +    } else if (n == FixedBitSizes::FIFTYSIX) {
    +      return 56;
    +    } else {
    +      return 64;
    +    }
    +  }
    +
    +  inline uint32_t getClosestFixedBits(uint32_t n) {
    +    if (n == 0) {
    +      return 1;
    +    }
    +
    +    if (n >= 1 && n <= 24) {
    +      return n;
    +    } else if (n <= 26) {
    +      return 26;
    +    } else if (n <= 28) {
    +      return 28;
    +    } else if (n <= 30) {
    +      return 30;
    +    } else if (n <= 32) {
    +      return 32;
    +    } else if (n <= 40) {
    +      return 40;
    +    } else if (n <= 48) {
    +      return 48;
    +    } else if (n <= 56) {
    +      return 56;
    +    } else {
    +      return 64;
    +    }
    +  }
    +
    +  inline uint32_t getClosestAlignedFixedBits(uint32_t n) {
    +    if (n == 0 ||  n == 1) {
    +      return 1;
    +    } else if (n <= 2) {
    +      return 2;
    +    } else if (n <= 4) {
    +      return 4;
    +    } else if (n <= 8) {
    +      return 8;
    +    } else if (n <= 16) {
    +      return 16;
    +    } else if (n <= 24) {
    +      return 24;
    +    } else if (n <= 32) {
    +      return 32;
    +    } else if (n <= 40) {
    +      return 40;
    +    } else if (n <= 48) {
    +      return 48;
    +    } else if (n <= 56) {
    +      return 56;
    +    } else {
    +      return 64;
    +    }
    +  }
    +
    +  inline uint32_t encodeBitWidth(uint32_t n) {
    +    n = getClosestFixedBits(n);
    +
    +    if (n >= 1 && n <= 24) {
    +      return n - 1;
    +    } else if (n <= 26) {
    +      return FixedBitSizes::TWENTYSIX;
    +    } else if (n <= 28) {
    +      return FixedBitSizes::TWENTYEIGHT;
    +    } else if (n <= 30) {
    +      return FixedBitSizes::THIRTY;
    +    } else if (n <= 32) {
    +      return FixedBitSizes::THIRTYTWO;
    +    } else if (n <= 40) {
    +      return FixedBitSizes::FORTY;
    +    } else if (n <= 48) {
    +      return FixedBitSizes::FORTYEIGHT;
    +    } else if (n <= 56) {
    +      return FixedBitSizes::FIFTYSIX;
    +    } else {
    +      return FixedBitSizes::SIXTYFOUR;
    +    }
    +  }
    +
    +  inline uint32_t findClosestNumBits(int64_t value) {
    +    if (value < 0) {
    +      return getClosestFixedBits(64);
    +    }
    +
    +    uint32_t count = 0;
    +    while (value != 0) {
    +      count++;
    +      value = value >> 1;
    +    }
    +    return getClosestFixedBits(count);
    +  }
    +
    +  inline bool isSafeSubtract(long left, long right) {
    --- End diff --
    
    s/long/int64_t


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by wgtmac <gi...@git.apache.org>.

Github user wgtmac commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190801085
  
    --- Diff: c++/src/RLE.hh ---
    @@ -68,7 +76,24 @@ namespace orc {
          * record current position
          * @param recorder use the recorder to record current positions
          */
    -    virtual void recordPosition(PositionRecorder* recorder) const = 0;
    +    virtual void recordPosition(PositionRecorder* recorder) const;
    +
    +  protected:
    +    std::unique_ptr<BufferedOutputStream> outputStream;
    +    size_t bufferPosition;
    +    size_t bufferLength;
    +    size_t numLiterals;
    +    int64_t* literals;
    +    bool isSigned;
    +    char* buffer;
    +
    +    virtual void write(int64_t val) = 0;
    +
    +    virtual void writeByte(char c);
    +
    +    virtual void writeVulong(int64_t val);
    +
    +    virtual void writeVslong(int64_t val);  protected:
    --- End diff --
    
    remove protected


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191995987
  
    --- Diff: c++/src/RLE.hh ---
    @@ -68,7 +76,24 @@ namespace orc {
          * record current position
          * @param recorder use the recorder to record current positions
          */
    -    virtual void recordPosition(PositionRecorder* recorder) const = 0;
    +    virtual void recordPosition(PositionRecorder* recorder) const;
    +
    +  protected:
    +    std::unique_ptr<BufferedOutputStream> outputStream;
    +    size_t bufferPosition;
    +    size_t bufferLength;
    +    size_t numLiterals;
    +    int64_t* literals;
    +    bool isSigned;
    +    char* buffer;
    --- End diff --
    
    Added initialisation to constructor to initialised it to nullptr.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by jamesclampffer <gi...@git.apache.org>.

Github user jamesclampffer commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191932328
  
    --- Diff: c++/include/orc/Writer.hh ---
    @@ -38,6 +38,11 @@ namespace orc {
         CompressionStrategy_COMPRESSION
       };
     
    +  enum RleVersion {
    +    RleVersion_1,
    --- End diff --
    
    You may want to explicitly assign values here to ensure the addition of new encodings or refactoring work can't unintentionally change the underlying integer values.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191615840
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    --- End diff --
    
    Good point! Should we just add `FixedBitSizes::SIZE` as another element and use it then? 


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191269083
  
    --- Diff: c++/src/CMakeLists.txt ---
    @@ -179,15 +179,15 @@ set(SOURCE_FILES
       OrcFile.cc
       Reader.cc
       RLEv1.cc
    -  RLEv2.cc
    +  RleDecoderV2.cc
    +  RleEncoderV2.cc
    --- End diff --
    
    We split the Encoder and Decoder into two files for V2 and not for V1. Can we combine them into a single file for V2 as well?


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191816344
  
    --- Diff: c++/test/TestWriter.cc ---
    @@ -139,7 +136,6 @@ namespace orc {
                                           *type,
                                           pool,
                                           &memStream,
    -                                      rleVersion,
                                           FileVersion(0, 11));
    --- End diff --
    
    `FileVersion::v_0_11()`


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191832950
  
    --- Diff: c++/src/Writer.cc ---
    @@ -122,9 +127,17 @@ namespace orc {
       }
     
       WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) {
    -    // Only Hive_0_11 version is supported currently
    -    if (version.getMajor() == 0 && version.getMinor() == 11) {
    +    // Only Hive_0_11 and Hive_0_12 version are supported currently
    +    if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) {
    --- End diff --
    
    My suggestion is to use this logic to implement `WriterOptions::getRleVersion()`.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r192593861
  
    --- Diff: c++/src/RLEv2.hh ---
    @@ -25,13 +25,89 @@
     
     #include <vector>
     
    +#define MIN_REPEAT 3
    +#define HIST_LEN 32
     namespace orc {
     
    -class RleDecoderV2 : public RleDecoder {
    +struct FixedBitSizes {
    +    enum FBS {
    +        ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
    +        THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
    +        TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
    +        TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
    +    };
    +};
    +
    +enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
    +
    +struct EncodingOption {
    +  EncodingType encoding;
    +  int64_t fixedDelta;
    +  int64_t gapVsPatchListCount;
    +  int64_t zigzagLiteralsCount;
    +  int64_t baseRedLiteralsCount;
    +  int64_t adjDeltasCount;
    +  uint32_t zzBits90p;
    +  uint32_t zzBits100p;
    +  uint32_t brBits95p;
    +  uint32_t brBits100p;
    +  uint32_t bitsDeltaMax;
    +  uint32_t patchWidth;
    +  uint32_t patchGapWidth;
    +  uint32_t patchLength;
    +  int64_t min;
    +  bool isFixedDelta;
    +};
    +
    +class RleEncoderV2 : public RleEncoder {
     public:
    +    RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);
    --- End diff --
    
    `alignedBitPacking` is always true. Should we add a WriterOption to enable/disable it?
    Java uses the Encoding Strategy to choose this. C++ currently does not have this.
    ```
    java/core/src/java/org/apache/orc/impl/writer/TreeWriterBase.java:144
    if (writer.getEncodingStrategy().equals(OrcFile.EncodingStrategy.SPEED)) {
         alignedBitpacking = true;
    }
    ```


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191995946
  
    --- Diff: c++/src/Writer.cc ---
    @@ -122,9 +127,17 @@ namespace orc {
       }
     
       WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) {
    -    // Only Hive_0_11 version is supported currently
    -    if (version.getMajor() == 0 && version.getMinor() == 11) {
    +    // Only Hive_0_11 and Hive_0_12 version are supported currently
    +    if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) {
    --- End diff --
    
    Done


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on the issue:

    https://github.com/apache/orc/pull/273
  
    @xndai  and @yuruiz  thanks for contributing this code. I will take a look at this.


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by wgtmac <gi...@git.apache.org>.

Github user wgtmac commented on the issue:

    https://github.com/apache/orc/pull/273
  
    Thanks everyone for reviewing this!
    
    Does anyone have any comment? If not, I will merge tomorrow.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by rip-nsk <gi...@git.apache.org>.

Github user rip-nsk commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190921029
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    +        // compute the histogram
    +        for(size_t i = offset; i < (offset + length); i++) {
    +            uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
    +            histgram[idx] += 1;
    +        }
    +    }
    +
    +    int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
    +
    +    // return the bits required by pth percentile length
    +    for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
    +        perLen -= histgram[i];
    +        if (perLen < 0) {
    +            return decodeBitWidth(static_cast<uint32_t>(i));
    +        }
    +    }
    +
    +    return 0;
    +}
    +
    +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
    +                           bool hasSigned, bool alignBitPacking) :
    +        RleEncoder(std::move(outStream), hasSigned),
    +        alignedBitPacking(alignBitPacking),
    +        prevDelta(0){
    +    literals = new int64_t[MAX_LITERAL_SIZE];
    +    gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
    +    zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    adjDeltas = new int64_t[MAX_LITERAL_SIZE];
    +}
    +
    +void RleEncoderV2::write(int64_t val) {
    +    if(numLiterals == 0) {
    +        initializeLiterals(val);
    +        return;
    +    }
    +
    +    if(numLiterals == 1) {
    +        prevDelta = val - literals[0];
    +        literals[numLiterals++] = val;
    +
    +        if(val == literals[0]) {
    +            fixedRunLength = 2;
    +            variableRunLength = 0;
    +        } else {
    +            fixedRunLength = 0;
    +            variableRunLength = 2;
    +        }
    +        return;
    +    }
    +
    +    int64_t currentDelta = val - literals[numLiterals - 1];
    +    EncodingOption option = {};
    +    if (prevDelta == 0 && currentDelta == 0) {
    +        // case 1: fixed delta run
    +        literals[numLiterals++] = val;
    +
    +        if (variableRunLength > 0) {
    +            // if variable run is non-zero then we are seeing repeating
    +            // values at the end of variable run in which case fixed Run
    +            // length is 2
    +            fixedRunLength = 2;
    +        }
    +        fixedRunLength++;
    +
    +        // if fixed run met the minimum condition and if variable
    +        // run is non-zero then flush the variable run and shift the
    +        // tail fixed runs to start of the buffer
    +        if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
    +            numLiterals -= MIN_REPEAT;
    +            variableRunLength -= (MIN_REPEAT - 1);
    +
    +            int64_t tailVals[MIN_REPEAT] = {0};
    +
    +            memcpy(tailVals, literals + numLiterals, sizeof(int64_t) * MIN_REPEAT);
    +            determineEncoding(option);
    +            writeValues(option);
    +
    +            // shift tail fixed runs to beginning of the buffer
    +            for (size_t i = 0; i < MIN_REPEAT; ++i) {
    +                literals[numLiterals++] = tailVals[i];
    +            }
    +        }
    +
    +        if (fixedRunLength == MAX_LITERAL_SIZE) {;
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +        return;
    +    }
    +
    +    // case 2: variable delta run
    +
    +    // if fixed run length is non-zero and if it satisfies the
    +    // short repeat conditions then write the values as short repeats
    +    // else use delta encoding
    +    if (fixedRunLength >= MIN_REPEAT) {
    +        if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
    +            option.encoding = SHORT_REPEAT;
    +        } else {
    +            option.encoding = DELTA;
    +            option.isFixedDelta = true;
    +        }
    +        writeValues(option);
    +    }
    +
    +    // if fixed run length is <MIN_REPEAT and current value is
    +    // different from previous then treat it as variable run
    +    if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
    +        variableRunLength = fixedRunLength;
    +        fixedRunLength = 0;
    +    }
    +
    +    // after writing values re-initialize the variables
    +    if (numLiterals == 0) {
    +        initializeLiterals(val);
    +    } else {
    +        prevDelta = val - literals[numLiterals - 1];
    +        literals[numLiterals++] = val;
    +        variableRunLength++;
    +
    +        if (variableRunLength == MAX_LITERAL_SIZE) {
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +    }
    +}
    +
    +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
    +    int64_t zzEncVal = 0;
    +    for (size_t i = 0; i < numLiterals; i++) {
    +        if (isSigned) {
    +            zzEncVal = zigZag(literals[i]);
    +        } else {
    +            zzEncVal = literals[i];
    +        }
    +        zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
    +    }
    +}
    +
    +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
    +    // mask will be max value beyond which patch will be generated
    +    int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    +
    +    // since we are considering only 95 percentile, the size of gap and
    +    // patch array can contain only be 5% values
    +    option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20)));
    +
    +    // #bit for patch
    +    option.patchWidth = option.brBits100p - option.brBits95p;
    +    option.patchWidth = getClosestFixedBits(option.patchWidth);
    +
    +    // if patch bit requirement is 64 then it will not possible to pack
    +    // gap and patch together in a long. To make sure gap and patch can be
    +    // packed together adjust the patch width
    +    if (option.patchWidth == 64) {
    +        option.patchWidth = 56;
    +        option.brBits95p = 8;
    +        mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    --- End diff --
    
    (static_cast<int64_t>(1) << option.brBits95p)


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191337994
  
    --- Diff: c++/src/CMakeLists.txt ---
    @@ -179,15 +179,15 @@ set(SOURCE_FILES
       OrcFile.cc
       Reader.cc
       RLEv1.cc
    -  RLEv2.cc
    +  RleDecoderV2.cc
    +  RleEncoderV2.cc
    --- End diff --
    
    The reason I prefer to Split Encoder and Decoder into two files for V2 simply because the code has grow too big that it would be very difficult to navigate if combine them into a single file.


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on the issue:

    https://github.com/apache/orc/pull/273
  
    [benchmark.xlsx](https://github.com/apache/orc/files/2047317/benchmark.xlsx)
    RleV2 benchmark


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191816749
  
    --- Diff: c++/test/TestWriter.cc ---
    @@ -47,7 +47,6 @@ namespace orc {
                                           const Type& type,
                                           MemoryPool* memoryPool,
                                           OutputStream* stream,
    -                                      RleVersion rleVersion,
                                           FileVersion version = FileVersion(0, 12)){
    --- End diff --
    
    `FileVersion::v_0_12()`


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191340586
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    --- End diff --
    
    but FixedBitSizes::LAST equal to 31 here


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191271607
  
    --- Diff: c++/src/RleDecoderV2.cc ---
    @@ -1,10 +1,10 @@
     /**
      * Licensed to the Apache Software Foundation (ASF) under one
      * or more contributor license agreements.  See the NOTICE file
    - * distributed with this work for additional information
    - * regarding copyright ownership.  The ASF licenses this file
    + * distributed with option work for additional information
    --- End diff --
    
    The Apache license header must not change.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191830703
  
    --- Diff: c++/src/Writer.cc ---
    @@ -38,9 +38,10 @@ namespace orc {
         FileVersion fileVersion;
         double dictionaryKeySizeThreshold;
         bool enableIndex;
    +    RleVersion rleVersion;
    --- End diff --
    
    To be clear, do we need this `RleVersion` here and in the `WriterOptions`?


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by wgtmac <gi...@git.apache.org>.

Github user wgtmac commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190801522
  
    --- Diff: c++/src/CMakeLists.txt ---
    @@ -179,15 +179,17 @@ set(SOURCE_FILES
       OrcFile.cc
       Reader.cc
       RLEv1.cc
    -  RLEv2.cc
    +  RLEv2.hh
    --- End diff --
    
    remove these two .hh files.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191267866
  
    --- Diff: c++/include/orc/Writer.hh ---
    @@ -164,6 +169,16 @@ namespace orc {
          */
         std::ostream * getErrorStream() const;
     
    +    /**
    +     * Set the RLE version.
    +     */
    +    WriterOptions& setRleVersion(RleVersion version);
    --- End diff --
    
    `WriterOptions& setRleVersion(const RleVersion& version);`


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by rip-nsk <gi...@git.apache.org>.

Github user rip-nsk commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190920599
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    +        // compute the histogram
    +        for(size_t i = offset; i < (offset + length); i++) {
    +            uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
    +            histgram[idx] += 1;
    +        }
    +    }
    +
    +    int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
    +
    +    // return the bits required by pth percentile length
    +    for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
    +        perLen -= histgram[i];
    +        if (perLen < 0) {
    +            return decodeBitWidth(static_cast<uint32_t>(i));
    +        }
    +    }
    +
    +    return 0;
    +}
    +
    +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
    +                           bool hasSigned, bool alignBitPacking) :
    +        RleEncoder(std::move(outStream), hasSigned),
    +        alignedBitPacking(alignBitPacking),
    +        prevDelta(0){
    +    literals = new int64_t[MAX_LITERAL_SIZE];
    +    gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
    +    zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    adjDeltas = new int64_t[MAX_LITERAL_SIZE];
    +}
    +
    +void RleEncoderV2::write(int64_t val) {
    +    if(numLiterals == 0) {
    +        initializeLiterals(val);
    +        return;
    +    }
    +
    +    if(numLiterals == 1) {
    +        prevDelta = val - literals[0];
    +        literals[numLiterals++] = val;
    +
    +        if(val == literals[0]) {
    +            fixedRunLength = 2;
    +            variableRunLength = 0;
    +        } else {
    +            fixedRunLength = 0;
    +            variableRunLength = 2;
    +        }
    +        return;
    +    }
    +
    +    int64_t currentDelta = val - literals[numLiterals - 1];
    +    EncodingOption option = {};
    +    if (prevDelta == 0 && currentDelta == 0) {
    +        // case 1: fixed delta run
    +        literals[numLiterals++] = val;
    +
    +        if (variableRunLength > 0) {
    +            // if variable run is non-zero then we are seeing repeating
    +            // values at the end of variable run in which case fixed Run
    +            // length is 2
    +            fixedRunLength = 2;
    +        }
    +        fixedRunLength++;
    +
    +        // if fixed run met the minimum condition and if variable
    +        // run is non-zero then flush the variable run and shift the
    +        // tail fixed runs to start of the buffer
    +        if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
    +            numLiterals -= MIN_REPEAT;
    +            variableRunLength -= (MIN_REPEAT - 1);
    +
    +            int64_t tailVals[MIN_REPEAT] = {0};
    +
    +            memcpy(tailVals, literals + numLiterals, sizeof(int64_t) * MIN_REPEAT);
    +            determineEncoding(option);
    +            writeValues(option);
    +
    +            // shift tail fixed runs to beginning of the buffer
    +            for (size_t i = 0; i < MIN_REPEAT; ++i) {
    +                literals[numLiterals++] = tailVals[i];
    +            }
    +        }
    +
    +        if (fixedRunLength == MAX_LITERAL_SIZE) {;
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +        return;
    +    }
    +
    +    // case 2: variable delta run
    +
    +    // if fixed run length is non-zero and if it satisfies the
    +    // short repeat conditions then write the values as short repeats
    +    // else use delta encoding
    +    if (fixedRunLength >= MIN_REPEAT) {
    +        if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
    +            option.encoding = SHORT_REPEAT;
    +        } else {
    +            option.encoding = DELTA;
    +            option.isFixedDelta = true;
    +        }
    +        writeValues(option);
    +    }
    +
    +    // if fixed run length is <MIN_REPEAT and current value is
    +    // different from previous then treat it as variable run
    +    if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
    +        variableRunLength = fixedRunLength;
    +        fixedRunLength = 0;
    +    }
    +
    +    // after writing values re-initialize the variables
    +    if (numLiterals == 0) {
    +        initializeLiterals(val);
    +    } else {
    +        prevDelta = val - literals[numLiterals - 1];
    +        literals[numLiterals++] = val;
    +        variableRunLength++;
    +
    +        if (variableRunLength == MAX_LITERAL_SIZE) {
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +    }
    +}
    +
    +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
    +    int64_t zzEncVal = 0;
    +    for (size_t i = 0; i < numLiterals; i++) {
    +        if (isSigned) {
    +            zzEncVal = zigZag(literals[i]);
    +        } else {
    +            zzEncVal = literals[i];
    +        }
    +        zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
    +    }
    +}
    +
    +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
    +    // mask will be max value beyond which patch will be generated
    +    int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    --- End diff --
    
    (static_cast<int64_t>(1) << option.brBits95p)


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191815756
  
    --- Diff: c++/test/TestWriter.cc ---
    @@ -1174,5 +1170,5 @@ namespace orc {
         }
       }
     
    -  INSTANTIATE_TEST_CASE_P(OrcTest, WriterTest, Values(RleVersion_1, RleVersion_2));
    +  INSTANTIATE_TEST_CASE_P(OrcTest, WriterTest, Values(FileVersion::v_0_11(), FileVersion::v_0_11()));
    --- End diff --
    
    Should be `Values(FileVersion::v_0_11(), FileVersion::v_0_12()))`


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191672633
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    --- End diff --
    
    sure


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191291625
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    +        // compute the histogram
    +        for(size_t i = offset; i < (offset + length); i++) {
    +            uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
    +            histgram[idx] += 1;
    +        }
    +    }
    +
    +    int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
    +
    +    // return the bits required by pth percentile length
    +    for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
    +        perLen -= histgram[i];
    +        if (perLen < 0) {
    +            return decodeBitWidth(static_cast<uint32_t>(i));
    +        }
    +    }
    +
    --- End diff --
    
    extra line


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r192729045
  
    --- Diff: c++/test/TestRleEncoder.cc ---
    @@ -0,0 +1,243 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include <cstdlib>
    +
    +#include "MemoryOutputStream.hh"
    +#include "RLEv1.hh"
    +
    +#include "wrap/orc-proto-wrapper.hh"
    +#include "wrap/gtest-wrapper.h"
    +
    +namespace orc {
    +
    +  const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M
    +
    +  void generateData(
    +                     uint64_t numValues,
    +                     int64_t start,
    +                     int64_t delta,
    +                     bool random,
    +                     int64_t* data,
    +                     uint64_t numNulls = 0,
    +                     char* notNull = nullptr) {
    +    if (numNulls != 0 && notNull != nullptr) {
    +      memset(notNull, 1, numValues);
    +      while (numNulls > 0) {
    +        uint64_t pos = static_cast<uint64_t>(std::rand()) % numValues;
    +        if (notNull[pos]) {
    +          notNull[pos] = static_cast<char>(0);
    +          --numNulls;
    +        }
    +      }
    +    }
    +
    +    for (uint64_t i = 0; i < numValues; ++i) {
    +      if (notNull == nullptr || notNull[i])
    +      {
    +        if (!random) {
    +          data[i] = start + delta * static_cast<int64_t>(i);
    +        } else {
    +          data[i] = std::rand();
    +        }
    +      }
    +    }
    +  }
    +
    +  void decodeAndVerify(
    +                       RleVersion version,
    +                       const MemoryOutputStream& memStream,
    +                       int64_t * data,
    +                       uint64_t numValues,
    +                       const char* notNull,
    +                       bool isSinged) {
    +    std::unique_ptr<RleDecoder> decoder = createRleDecoder(
    +            std::unique_ptr<SeekableArrayInputStream>(new SeekableArrayInputStream(
    +                    memStream.getData(),
    +                    memStream.getLength())),
    +            isSinged, version, *getDefaultPool());
    +
    +    int64_t* decodedData = new int64_t[numValues];
    +    decoder->next(decodedData, numValues, notNull);
    +
    +    for (uint64_t i = 0; i < numValues; ++i) {
    +      if (!notNull || notNull[i]) {
    +        EXPECT_EQ(data[i], decodedData[i]);
    +      }
    +    }
    +
    +    delete [] decodedData;
    +  }
    +
    +  std::unique_ptr<RleEncoder> getEncoder(RleVersion version,
    +                                        MemoryOutputStream& memStream,
    +                                        bool isSigned)
    +  {
    +    MemoryPool * pool = getDefaultPool();
    +
    +    return createRleEncoder(
    +            std::unique_ptr<BufferedOutputStream>(
    +                    new BufferedOutputStream(*pool, &memStream, 500 * 1024, 1024)),
    +            isSigned, version, *pool, true);
    --- End diff --
    
    can we template these tests for `alignedBitpacking =  false`?


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by wgtmac <gi...@git.apache.org>.

Github user wgtmac commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190800936
  
    --- Diff: c++/src/RLE.cc ---
    @@ -64,4 +66,55 @@ namespace orc {
         }
       }
     
    +  void RleEncoder::add(const int64_t* data, uint64_t numValues,
    +                         const char* notNull) {
    +    for (uint64_t i = 0; i < numValues; ++i) {
    +      if (!notNull || notNull[i]) {
    +        write(data[i]);
    +      }
    +    }
    +  }
    +
    +  void RleEncoder::writeVslong(int64_t val) {
    +    writeVulong((val << 1) ^ (val >> 63));
    +  }
    +
    +  void RleEncoder::writeVulong(int64_t val) {
    +    while (true) {
    +      if ((val & ~0x7f) == 0) {
    +        writeByte(static_cast<char>(val));
    +        return;
    +      } else {
    +        writeByte(static_cast<char>(0x80 | (val & 0x7f)));
    +        // cast val to unsigned so as to force 0-fill right shift
    +        val = (static_cast<uint64_t>(val) >> 7);
    +      }
    +    }
    +  }
    +
    +  void RleEncoder::writeByte(char c) {
    +    if (bufferPosition == bufferLength) {
    +      int addedSize = 0;
    +      if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
    +        throw std::bad_alloc();
    +      }
    +      bufferPosition = 0;
    +      bufferLength = static_cast<size_t>(addedSize);
    +    }
    +    buffer[bufferPosition++] = c;
    +  }
    +
    +  void RleEncoder::recordPosition(PositionRecorder* recorder) const {
    --- End diff --
    
    We haven't added support for writing index stream so far. Remove this function for now and that should be in a separate change.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191973999
  
    --- Diff: c++/src/Writer.cc ---
    @@ -38,9 +38,10 @@ namespace orc {
         FileVersion fileVersion;
         double dictionaryKeySizeThreshold;
         bool enableIndex;
    +    RleVersion rleVersion;
    --- End diff --
    
    yes, please check the Writer.cc:129 setFileVersion(), we removed setRleVersion and will only update RleVersion on setting FileVersion.
    
    The reason we still keep RleVersion in WriterOptions is that there is dependency on RleVersion everywhere, removing it should be done in separate PR.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191291449
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    --- End diff --
    
    Use `FixedBitSizes::LAST` instead of 32?


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by asfgit <gi...@git.apache.org>.

Github user asfgit closed the pull request at:

    https://github.com/apache/orc/pull/273


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191102909
  
    --- Diff: c++/src/RLE.cc ---
    @@ -64,4 +66,55 @@ namespace orc {
         }
       }
     
    +  void RleEncoder::add(const int64_t* data, uint64_t numValues,
    +                         const char* notNull) {
    +    for (uint64_t i = 0; i < numValues; ++i) {
    +      if (!notNull || notNull[i]) {
    +        write(data[i]);
    +      }
    +    }
    +  }
    +
    +  void RleEncoder::writeVslong(int64_t val) {
    +    writeVulong((val << 1) ^ (val >> 63));
    +  }
    +
    +  void RleEncoder::writeVulong(int64_t val) {
    +    while (true) {
    +      if ((val & ~0x7f) == 0) {
    +        writeByte(static_cast<char>(val));
    +        return;
    +      } else {
    +        writeByte(static_cast<char>(0x80 | (val & 0x7f)));
    +        // cast val to unsigned so as to force 0-fill right shift
    +        val = (static_cast<uint64_t>(val) >> 7);
    +      }
    +    }
    +  }
    +
    +  void RleEncoder::writeByte(char c) {
    +    if (bufferPosition == bufferLength) {
    +      int addedSize = 0;
    +      if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
    +        throw std::bad_alloc();
    +      }
    +      bufferPosition = 0;
    +      bufferLength = static_cast<size_t>(addedSize);
    +    }
    +    buffer[bufferPosition++] = c;
    +  }
    +
    +  void RleEncoder::recordPosition(PositionRecorder* recorder) const {
    --- End diff --
    
    This method has been exits for a while, removing it requires a wide range refactoring, which is not the purpose of this PR.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by jamesclampffer <gi...@git.apache.org>.

Github user jamesclampffer commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191935614
  
    --- Diff: c++/src/RLE.hh ---
    @@ -68,7 +76,24 @@ namespace orc {
          * record current position
          * @param recorder use the recorder to record current positions
          */
    -    virtual void recordPosition(PositionRecorder* recorder) const = 0;
    +    virtual void recordPosition(PositionRecorder* recorder) const;
    +
    +  protected:
    +    std::unique_ptr<BufferedOutputStream> outputStream;
    +    size_t bufferPosition;
    +    size_t bufferLength;
    +    size_t numLiterals;
    +    int64_t* literals;
    +    bool isSigned;
    +    char* buffer;
    --- End diff --
    
    Is this initialized anywhere?


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by rip-nsk <gi...@git.apache.org>.

Github user rip-nsk commented on the issue:

    https://github.com/apache/orc/pull/273
  
      C:\projects\orc\c++\src\RleEncoderV2.cc(187): warning C4334: '<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) [C:\projects\orc\build\c++\src\orc.vcxproj]
      C:\projects\orc\c++\src\RleEncoderV2.cc(203): warning C4334: '<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?) [C:\projects\orc\build\c++\src\orc.vcxproj]
      C:\projects\orc\c++\src\RleEncoderV2.cc(334): warning C4244: 'argument': conversion from 'int64_t' to 'long', possible loss of data [C:\projects\orc\build\c++\src\orc.vcxproj]
      C:\projects\orc\c++\src\RleEncoderV2.cc(735): warning C4244: 'initializing': conversion from 'int64_t' to 'long', possible


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191829402
  
    --- Diff: c++/src/Writer.cc ---
    @@ -38,9 +38,10 @@ namespace orc {
         FileVersion fileVersion;
         double dictionaryKeySizeThreshold;
         bool enableIndex;
    +    RleVersion rleVersion;
    --- End diff --
    
    I think the file version should determine the `RleVersion`. Refer `isNewWriteFormat` and `isDirectV2` on the Java side.



---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191275478
  
    --- Diff: c++/src/Writer.cc ---
    @@ -38,9 +38,10 @@ namespace orc {
         FileVersion fileVersion;
         double dictionaryKeySizeThreshold;
         bool enableIndex;
    +    RleVersion rleVersion;
     
         WriterOptionsPrivate() :
    -                            fileVersion(0, 11) { // default to Hive_0_11
    +                            fileVersion(0, 12) { // default to Hive_0_12
    --- End diff --
    
    We should use the static constants proposed in PR https://github.com/apache/orc/pull/274 moving forward.


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by yuruiz <gi...@git.apache.org>.

Github user yuruiz commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r192614780
  
    --- Diff: c++/src/RLEv2.hh ---
    @@ -25,13 +25,89 @@
     
     #include <vector>
     
    +#define MIN_REPEAT 3
    +#define HIST_LEN 32
     namespace orc {
     
    -class RleDecoderV2 : public RleDecoder {
    +struct FixedBitSizes {
    +    enum FBS {
    +        ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
    +        THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
    +        TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
    +        TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE
    +    };
    +};
    +
    +enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
    +
    +struct EncodingOption {
    +  EncodingType encoding;
    +  int64_t fixedDelta;
    +  int64_t gapVsPatchListCount;
    +  int64_t zigzagLiteralsCount;
    +  int64_t baseRedLiteralsCount;
    +  int64_t adjDeltasCount;
    +  uint32_t zzBits90p;
    +  uint32_t zzBits100p;
    +  uint32_t brBits95p;
    +  uint32_t brBits100p;
    +  uint32_t bitsDeltaMax;
    +  uint32_t patchWidth;
    +  uint32_t patchGapWidth;
    +  uint32_t patchLength;
    +  int64_t min;
    +  bool isFixedDelta;
    +};
    +
    +class RleEncoderV2 : public RleEncoder {
     public:
    +    RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true);
    --- End diff --
    
    Done


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on the issue:

    https://github.com/apache/orc/pull/273
  
    +1 LGTM


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191290786
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    --- End diff --
    
    `>=0.0 to <=1.0`


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by jamesclampffer <gi...@git.apache.org>.

Github user jamesclampffer commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r191933565
  
    --- Diff: c++/src/ColumnWriter.cc ---
    @@ -1675,7 +1677,9 @@ namespace orc {
       void ListColumnWriter::getColumnEncoding(
                         std::vector<proto::ColumnEncoding>& encodings) const {
         proto::ColumnEncoding encoding;
    -    encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
    +    encoding.set_kind(rleVersion == RleVersion_1 ?
    --- End diff --
    
    You could clean up the initializer list and avoid long term version mismatch issues if you add a helper function that maps RleVersion1 to ColumnEncoding_Kind_DIRECT and RleVersion_2 to ColumnEncoding_Kind_DIRECT_V2 and errors/throws on other values.


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by majetideepak <gi...@git.apache.org>.

Github user majetideepak commented on the issue:

    https://github.com/apache/orc/pull/273
  
    The PR looks overall good to me apart from a minor change requested. This is an important patch to align the C++ and Java implementations. Thanks again for working on this!


---

[GitHub] orc pull request #273: ORC-343 Enable C++ writer to support RleV2

Posted by rip-nsk <gi...@git.apache.org>.

Github user rip-nsk commented on a diff in the pull request:

    https://github.com/apache/orc/pull/273#discussion_r190919609
  
    --- Diff: c++/src/RleEncoderV2.cc ---
    @@ -0,0 +1,768 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with option work for additional information
    + * regarding copyright ownership.  The ASF licenses option file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use option file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +#include "Adaptor.hh"
    +#include "Compression.hh"
    +#include "RLEv2.hh"
    +#include "RLEV2Util.hh"
    +
    +#define MAX_LITERAL_SIZE 512
    +#define MAX_SHORT_REPEAT_LENGTH 10
    +
    +namespace orc {
    +
    +/**
    + * Compute the bits required to represent pth percentile value
    + * @param data - array
    + * @param p - percentile value (&gt;=0.0 to &lt;=1.0)
    + * @return pth percentile bits
    + */
    +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) {
    +    if ((p > 1.0) || (p <= 0.0)) {
    +        throw InvalidArgument("Invalid p value: " + std::to_string(p));
    +    }
    +
    +    if (!reuseHist) {
    +        // histogram that store the encoded bit requirement for each values.
    +        // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
    +        memset(histgram, 0, 32 * sizeof(int32_t));
    +        // compute the histogram
    +        for(size_t i = offset; i < (offset + length); i++) {
    +            uint32_t idx = encodeBitWidth(findClosestNumBits(data[i]));
    +            histgram[idx] += 1;
    +        }
    +    }
    +
    +    int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p));
    +
    +    // return the bits required by pth percentile length
    +    for(int32_t i = HIST_LEN - 1; i >= 0; i--) {
    +        perLen -= histgram[i];
    +        if (perLen < 0) {
    +            return decodeBitWidth(static_cast<uint32_t>(i));
    +        }
    +    }
    +
    +    return 0;
    +}
    +
    +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream,
    +                           bool hasSigned, bool alignBitPacking) :
    +        RleEncoder(std::move(outStream), hasSigned),
    +        alignedBitPacking(alignBitPacking),
    +        prevDelta(0){
    +    literals = new int64_t[MAX_LITERAL_SIZE];
    +    gapVsPatchList = new int64_t[MAX_LITERAL_SIZE];
    +    zigzagLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    baseRedLiterals = new int64_t[MAX_LITERAL_SIZE];
    +    adjDeltas = new int64_t[MAX_LITERAL_SIZE];
    +}
    +
    +void RleEncoderV2::write(int64_t val) {
    +    if(numLiterals == 0) {
    +        initializeLiterals(val);
    +        return;
    +    }
    +
    +    if(numLiterals == 1) {
    +        prevDelta = val - literals[0];
    +        literals[numLiterals++] = val;
    +
    +        if(val == literals[0]) {
    +            fixedRunLength = 2;
    +            variableRunLength = 0;
    +        } else {
    +            fixedRunLength = 0;
    +            variableRunLength = 2;
    +        }
    +        return;
    +    }
    +
    +    int64_t currentDelta = val - literals[numLiterals - 1];
    +    EncodingOption option = {};
    +    if (prevDelta == 0 && currentDelta == 0) {
    +        // case 1: fixed delta run
    +        literals[numLiterals++] = val;
    +
    +        if (variableRunLength > 0) {
    +            // if variable run is non-zero then we are seeing repeating
    +            // values at the end of variable run in which case fixed Run
    +            // length is 2
    +            fixedRunLength = 2;
    +        }
    +        fixedRunLength++;
    +
    +        // if fixed run met the minimum condition and if variable
    +        // run is non-zero then flush the variable run and shift the
    +        // tail fixed runs to start of the buffer
    +        if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
    +            numLiterals -= MIN_REPEAT;
    +            variableRunLength -= (MIN_REPEAT - 1);
    +
    +            int64_t tailVals[MIN_REPEAT] = {0};
    +
    +            memcpy(tailVals, literals + numLiterals, sizeof(int64_t) * MIN_REPEAT);
    +            determineEncoding(option);
    +            writeValues(option);
    +
    +            // shift tail fixed runs to beginning of the buffer
    +            for (size_t i = 0; i < MIN_REPEAT; ++i) {
    +                literals[numLiterals++] = tailVals[i];
    +            }
    +        }
    +
    +        if (fixedRunLength == MAX_LITERAL_SIZE) {;
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +        return;
    +    }
    +
    +    // case 2: variable delta run
    +
    +    // if fixed run length is non-zero and if it satisfies the
    +    // short repeat conditions then write the values as short repeats
    +    // else use delta encoding
    +    if (fixedRunLength >= MIN_REPEAT) {
    +        if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
    +            option.encoding = SHORT_REPEAT;
    +        } else {
    +            option.encoding = DELTA;
    +            option.isFixedDelta = true;
    +        }
    +        writeValues(option);
    +    }
    +
    +    // if fixed run length is <MIN_REPEAT and current value is
    +    // different from previous then treat it as variable run
    +    if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) {
    +        variableRunLength = fixedRunLength;
    +        fixedRunLength = 0;
    +    }
    +
    +    // after writing values re-initialize the variables
    +    if (numLiterals == 0) {
    +        initializeLiterals(val);
    +    } else {
    +        prevDelta = val - literals[numLiterals - 1];
    +        literals[numLiterals++] = val;
    +        variableRunLength++;
    +
    +        if (variableRunLength == MAX_LITERAL_SIZE) {
    +            determineEncoding(option);
    +            writeValues(option);
    +        }
    +    }
    +}
    +
    +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) {
    +    int64_t zzEncVal = 0;
    +    for (size_t i = 0; i < numLiterals; i++) {
    +        if (isSigned) {
    +            zzEncVal = zigZag(literals[i]);
    +        } else {
    +            zzEncVal = literals[i];
    +        }
    +        zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal;
    +    }
    +}
    +
    +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) {
    +    // mask will be max value beyond which patch will be generated
    +    int64_t mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    +
    +    // since we are considering only 95 percentile, the size of gap and
    +    // patch array can contain only be 5% values
    +    option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20)));
    +
    +    // #bit for patch
    +    option.patchWidth = option.brBits100p - option.brBits95p;
    +    option.patchWidth = getClosestFixedBits(option.patchWidth);
    +
    +    // if patch bit requirement is 64 then it will not possible to pack
    +    // gap and patch together in a long. To make sure gap and patch can be
    +    // packed together adjust the patch width
    +    if (option.patchWidth == 64) {
    +        option.patchWidth = 56;
    +        option.brBits95p = 8;
    +        mask = static_cast<int64_t>(1LU << option.brBits95p) - 1;
    +    }
    +
    +    uint32_t gapIdx = 0;
    +    uint32_t patchIdx = 0;
    +    size_t prev = 0;
    +    size_t maxGap = 0;
    +
    +    std::vector<int64_t> gapList;
    +    std::vector<int64_t> patchList;
    +
    +    for(size_t i = 0; i < numLiterals; i++) {
    +        // if value is above mask then create the patch and record the gap
    +        if (baseRedLiterals[i] > mask) {
    +            size_t gap = i - prev;
    +            if (gap > maxGap) {
    +                maxGap = gap;
    +            }
    +
    +            // gaps are relative, so store the previous patched value index
    +            prev = i;
    +            gapList.push_back(static_cast<int64_t>(gap));
    +            gapIdx++;
    +
    +            // extract the most significant bits that are over mask bits
    +            int64_t patch = baseRedLiterals[i] >> option.brBits95p;
    +            patchList.push_back(patch);
    +            patchIdx++;
    +
    +            // strip off the MSB to enable safe bit packing
    +            baseRedLiterals[i] &= mask;
    +        }
    +    }
    +
    +    // adjust the patch length to number of entries in gap list
    +    option.patchLength = gapIdx;
    +
    +    // if the element to be patched is the first and only element then
    +    // max gap will be 0, but to store the gap as 0 we need atleast 1 bit
    +    if (maxGap == 0 && option.patchLength != 0) {
    +        option.patchGapWidth = 1;
    +    } else {
    +        option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap));
    +    }
    +
    +    // special case: if the patch gap width is greater than 256, then
    +    // we need 9 bits to encode the gap width. But we only have 3 bits in
    +    // header to record the gap width. To deal with this case, we will save
    +    // two entries in patch list in the following way
    +    // 256 gap width => 0 for patch value
    +    // actual gap - 256 => actual patch value
    +    // We will do the same for gap width = 511. If the element to be patched is
    +    // the last element in the scope then gap width will be 511. In this case we
    +    // will have 3 entries in the patch list in the following way
    +    // 255 gap width => 0 for patch value
    +    // 255 gap width => 0 for patch value
    +    // 1 gap width => actual patch value
    +    if (option.patchGapWidth > 8) {
    +        option.patchGapWidth = 8;
    +        // for gap = 511, we need two additional entries in patch list
    +        if (maxGap == 511) {
    +            option.patchLength += 2;
    +        } else {
    +            option.patchLength += 1;
    +        }
    +    }
    +
    +    // create gap vs patch list
    +    gapIdx = 0;
    +    patchIdx = 0;
    +    for(size_t i = 0; i < option.patchLength; i++) {
    +        int64_t g = gapList[gapIdx++];
    +        int64_t p = patchList[patchIdx++];
    +        while (g > 255) {
    +            gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth);
    +            i++;
    +            g -= 255;
    +        }
    +
    +        // store patch value in LSBs and gap in MSBs
    +        gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p);
    +    }
    +}
    +
    +void RleEncoderV2::determineEncoding(EncodingOption& option) {
    +    // we need to compute zigzag values for DIRECT encoding if we decide to
    +    // break early for delta overflows or for shorter runs
    +    computeZigZagLiterals(option);
    +
    +    option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0);
    +
    +    // not a big win for shorter runs to determine encoding
    +    if (numLiterals <= MIN_REPEAT) {
    +        option.encoding = DIRECT;
    +        return;
    +    }
    +
    +    // DELTA encoding check
    +
    +    // for identifying monotonic sequences
    +    bool isIncreasing = true;
    +    bool isDecreasing = true;
    +    option.isFixedDelta = true;
    +
    +    option.min = literals[0];
    +    int64_t max = literals[0];
    +    int64_t initialDelta = literals[1] - literals[0];
    +    int64_t currDelta = 0;
    +    int64_t deltaMax = 0;
    +    adjDeltas[option.adjDeltasCount++] = initialDelta;
    +
    +    for (size_t i = 1; i < numLiterals; i++) {
    +        const int64_t l1 = literals[i];
    +        const int64_t l0 = literals[i - 1];
    +        currDelta = l1 - l0;
    +        option.min = std::min(option.min, l1);
    +        max = std::max(max, l1);
    +
    +        isIncreasing &= (l0 <= l1);
    +        isDecreasing &= (l0 >= l1);
    +
    +        option.isFixedDelta &= (currDelta == initialDelta);
    +        if (i > 1) {
    +            adjDeltas[option.adjDeltasCount++] = std::abs(currDelta);
    +            deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
    +        }
    +    }
    +
    +    // it's faster to exit under delta overflow condition without checking for
    +    // PATCHED_BASE condition as encoding using DIRECT is faster and has less
    +    // overhead than PATCHED_BASE
    +    if (!isSafeSubtract(max, option.min)) {
    +        option.encoding = DIRECT;
    +        return;
    +    }
    +
    +    // invariant - subtracting any number from any other in the literals after
    +    // option point won't overflow
    +
    +    // if min is equal to max then the delta is 0, option condition happens for
    +    // fixed values run >10 which cannot be encoded with SHORT_REPEAT
    +    if (option.min == max) {
    +        if (!option.isFixedDelta) {
    +            throw InvalidArgument(std::to_string(option.min) + "==" + std::to_string(max) + ", isFixedDelta cannot be false");
    +        }
    +
    +        if(currDelta != 0) {
    +            throw InvalidArgument(std::to_string(option.min) + "==" + std::to_string(max) + ", currDelta should be zero");
    +        }
    +        option.fixedDelta = 0;
    +        option.encoding = DELTA;
    +        return;
    +    }
    +
    +    if (option.isFixedDelta) {
    +        if (currDelta != initialDelta) {
    +            throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding");
    +        }
    +
    +        option.encoding = DELTA;
    +        option.fixedDelta = currDelta;
    +        return;
    +    }
    +
    +    // if initialDelta is 0 then we cannot delta encode as we cannot identify
    +    // the sign of deltas (increasing or decreasing)
    +    if (initialDelta != 0) {
    +        // stores the number of bits required for packing delta blob in
    +        // delta encoding
    +        option.bitsDeltaMax = findClosestNumBits(deltaMax);
    +
    +        // monotonic condition
    +        if (isIncreasing || isDecreasing) {
    +            option.encoding = DELTA;
    +            return;
    +        }
    +    }
    +
    +    // PATCHED_BASE encoding check
    +
    +    // percentile values are computed for the zigzag encoded values. if the
    +    // number of bit requirement between 90th and 100th percentile varies
    +    // beyond a threshold then we need to patch the values. if the variation
    +    // is not significant then we can use direct encoding
    +
    +    option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true);
    +    uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p;
    +
    +    // if the difference between 90th percentile and 100th percentile fixed
    +    // bits is > 1 then we need patch the values
    +    if (diffBitsLH > 1) {
    +
    +        // patching is done only on base reduced values.
    +        // remove base from literals
    +        for (size_t i = 0; i < numLiterals; i++) {
    +            baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min);
    +        }
    +
    +        // 95th percentile width is used to determine max allowed value
    +        // after which patching will be done
    +        option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95);
    +
    +        // 100th percentile is used to compute the max patch width
    +        option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true);
    +
    +        // after base reducing the values, if the difference in bits between
    +        // 95th percentile and 100th percentile value is zero then there
    +        // is no point in patching the values, in which case we will
    +        // fallback to DIRECT encoding.
    +        // The decision to use patched base was based on zigzag values, but the
    +        // actual patching is done on base reduced literals.
    +        if ((option.brBits100p - option.brBits95p) != 0) {
    +            option.encoding = PATCHED_BASE;
    +            preparePatchedBlob(option);
    +            return;
    +        } else {
    +            option.encoding = DIRECT;
    +            return;
    +        }
    +    } else {
    +        // if difference in bits between 95th percentile and 100th percentile is
    +        // 0, then patch length will become 0. Hence we will fallback to direct
    +        option.encoding = DIRECT;
    +        return;
    +    }
    +}
    +
    +uint64_t RleEncoderV2::flush() {
    +    if (numLiterals != 0) {
    +        EncodingOption option = {};
    +        if (variableRunLength != 0) {
    +            determineEncoding(option);
    +            writeValues(option);
    +        } else if (fixedRunLength != 0) {
    +            if (fixedRunLength < MIN_REPEAT) {
    +                variableRunLength = fixedRunLength;
    +                fixedRunLength = 0;
    +                determineEncoding(option);
    +                writeValues(option);
    +            } else if (fixedRunLength >= MIN_REPEAT
    +                       && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
    +                option.encoding = SHORT_REPEAT;
    +                writeValues(option);
    +            } else {
    +                option.encoding = DELTA;
    +                option.isFixedDelta = true;
    +                writeValues(option);
    +            }
    +        }
    +    }
    +
    +    outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition));
    +    uint64_t dataSize = outputStream->flush();
    +    bufferLength = bufferPosition = 0;
    +    return dataSize;
    +}
    +
    +void RleEncoderV2::writeValues(EncodingOption& option) {
    +    if (numLiterals != 0) {
    +        switch (option.encoding) {
    +            case SHORT_REPEAT:
    +                writeShortRepeatValues(option);
    +                break;
    +            case DIRECT:
    +                writeDirectValues(option);
    +                break;
    +            case PATCHED_BASE:
    +                writePatchedBasedValues(option);
    +                break;
    +            case DELTA:
    +                writeDeltaValues(option);
    +                break;
    +            default:
    +                throw NotImplementedYet("Not implemented yet");
    +        }
    +
    +        numLiterals = 0;
    +        prevDelta = 0;
    +    }
    +}
    +
    +void RleEncoderV2::writeShortRepeatValues(EncodingOption&) {
    +    int64_t repeatVal;
    +    if (isSigned) {
    +        repeatVal = zigZag(literals[0]);
    +    } else {
    +        repeatVal = literals[0];
    +    }
    +
    +    const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal);
    +    const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1);
    +
    +    uint32_t header = getOpCode(SHORT_REPEAT);
    +
    +    fixedRunLength -= MIN_REPEAT;
    +    header |= fixedRunLength;
    +    header |= ((numBytesRepeatVal - 1) << 3);
    +
    +    writeByte(static_cast<char>(header));
    +
    +    for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) {
    +        int64_t b = ((repeatVal >> (i * 8)) & 0xff);
    +        writeByte(static_cast<char>(b));
    +    }
    +
    +    fixedRunLength = 0;
    +}
    +
    +void RleEncoderV2::writeDirectValues(EncodingOption& option) {
    +    // write the number of fixed bits required in next 5 bits
    +    uint32_t fb = option.zzBits100p;
    +    if (alignedBitPacking) {
    +        fb = getClosestAlignedFixedBits(fb);
    +    }
    +
    +    const uint32_t efb = encodeBitWidth(fb) << 1;
    +
    +    // adjust variable run length
    +    variableRunLength -= 1;
    +
    +    // extract the 9th bit of run length
    +    const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
    +
    +    // create first byte of the header
    +    const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits);
    +
    +    // second byte of the header stores the remaining 8 bits of runlength
    +    const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
    +
    +    // write header
    +    writeByte(headerFirstByte);
    +    writeByte(headerSecondByte);
    +
    +    // bit packing the zigzag encoded literals
    +    writeInts(zigzagLiterals, 0, numLiterals, fb);
    +
    +    // reset run length
    +    variableRunLength = 0;
    +}
    +
    +void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) {
    +    // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
    +    // because patch is applied to MSB bits. For example: If fixed bit width of
    +    // base value is 7 bits and if patch is 3 bits, the actual value is
    +    // constructed by shifting the patch to left by 7 positions.
    +    // actual_value = patch << 7 | base_value
    +    // So, if we align base_value then actual_value can not be reconstructed.
    +
    +    // write the number of fixed bits required in next 5 bits
    +    const uint32_t efb = encodeBitWidth(option.brBits95p) << 1;
    +
    +    // adjust variable run length, they are one off
    +    variableRunLength -= 1;
    +
    +    // extract the 9th bit of run length
    +    const uint32_t tailBits = (variableRunLength & 0x100) >> 8;
    +
    +    // create first byte of the header
    +    const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits);
    +
    +    // second byte of the header stores the remaining 8 bits of runlength
    +    const char headerSecondByte = static_cast<char>(variableRunLength & 0xff);
    +
    +    // if the min value is negative toggle the sign
    +    const bool isNegative = (option.min < 0);
    +    if (isNegative) {
    +        option.min = -option.min;
    +    }
    +
    +    // find the number of bytes required for base and shift it by 5 bits
    +    // to accommodate patch width. The additional bit is used to store the sign
    +    // of the base value.
    +    const uint32_t baseWidth = findClosestNumBits(option.min) + 1;
    +    const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
    +    const uint32_t bb = (baseBytes - 1) << 5;
    +
    +    // if the base value is negative then set MSB to 1
    +    if (isNegative) {
    +        option.min |= (1LL << ((baseBytes * 8) - 1));
    +    }
    +
    +    // third byte contains 3 bits for number of bytes occupied by base
    +    // and 5 bits for patchWidth
    +    const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth));
    +
    +    // fourth byte contains 3 bits for page gap width and 5 bits for
    +    // patch length
    +    const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength);
    +
    +    // write header
    +    writeByte(headerFirstByte);
    +    writeByte(headerSecondByte);
    +    writeByte(headerThirdByte);
    +    writeByte(headerFourthByte);
    +
    +    // write the base value using fixed bytes in big endian order
    +    for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) {
    +        char b = static_cast<char>(((option.min >> (i * 8)) & 0xff));
    +        writeByte(b);
    +    }
    +
    +    // base reduced literals are bit packed
    +    uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p);
    +
    +    writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits);
    +
    +    // write patch list
    +    closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth);
    +
    +    writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits);
    +
    +    // reset run length
    +    variableRunLength = 0;
    +}
    +
    +void RleEncoderV2::writeDeltaValues(EncodingOption& option) {
    +    uint32_t len = 0;
    +    uint32_t fb = option.bitsDeltaMax;
    +    uint32_t efb = 0;
    +
    +    if (alignedBitPacking) {
    +        fb = getClosestAlignedFixedBits(fb);
    +    }
    +
    +    if (option.isFixedDelta) {
    +        // if fixed run length is greater than threshold then it will be fixed
    +        // delta sequence with delta value 0 else fixed delta sequence with
    +        // non-zero delta value
    +        if (fixedRunLength > MIN_REPEAT) {
    +            // ex. sequence: 2 2 2 2 2 2 2 2
    +            len = fixedRunLength - 1;
    +            fixedRunLength = 0;
    +        } else {
    +            // ex. sequence: 4 6 8 10 12 14 16
    +            len = variableRunLength - 1;
    +            variableRunLength = 0;
    +        }
    +    } else {
    +        // fixed width 0 is used for long repeating values.
    +        // sequences that require only 1 bit to encode will have an additional bit
    +        if (fb == 1) {
    +            fb = 2;
    +        }
    +        efb = encodeBitWidth(fb) << 1;
    +        len = variableRunLength - 1;
    +        variableRunLength = 0;
    +    }
    +
    +    // extract the 9th bit of run length
    +    const uint32_t tailBits = (len & 0x100) >> 8;
    +
    +    // create first byte of the header
    +    const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits);
    +
    +    // second byte of the header stores the remaining 8 bits of runlength
    +    const char headerSecondByte = static_cast<char>(len & 0xff);
    +
    +    // write header
    +    writeByte(headerFirstByte);
    +    writeByte(headerSecondByte);
    +
    +    // store the first value from zigzag literal array
    +    if (isSigned) {
    +        writeVslong(literals[0]);
    +    } else {
    +        writeVulong(literals[0]);
    +    }
    +
    +    if (option.isFixedDelta) {
    +        // if delta is fixed then we don't need to store delta blob
    +        writeVslong(option.fixedDelta);
    +    } else {
    +        // store the first value as delta value using zigzag encoding
    +        writeVslong(adjDeltas[0]);
    +
    +        // adjacent delta values are bit packed. The length of adjDeltas array is
    +        // always one less than the number of literals (delta difference for n
    +        // elements is n-1). We have already written one element, write the
    +        // remaining numLiterals - 2 elements here
    +        writeInts(adjDeltas, 1, numLiterals - 2, fb);
    +    }
    +}
    +
    +void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) {
    +  if(input == nullptr || len < 1 || bitSize < 1) {
    +      return;
    +  }
    +
    +  if (getClosestAlignedFixedBits(bitSize) == bitSize) {
    +    uint32_t numBytes;
    +    uint32_t endOffSet = static_cast<uint32_t>(offset + len);
    +    if (bitSize < 8 ) {;
    +      char bitMask = static_cast<char>((1 << bitSize) - 1);
    +      uint32_t numHops = 8 / bitSize;
    +      uint32_t remainder = static_cast<uint32_t>(len % numHops);
    +      uint32_t endUnroll = endOffSet - remainder;
    +      for (uint32_t i = offset; i < endUnroll; i+=numHops) {
    +        char toWrite = 0;
    +        for (uint32_t j = 0; j < numHops; ++j) {
    +          toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize));
    +        }
    +        writeByte(toWrite);
    +      }
    +
    +      if (remainder > 0) {
    +        uint32_t startShift = 8 - bitSize;
    +        char toWrite = 0;
    +        for (uint32_t i = endUnroll; i < endOffSet; ++i) {
    +          toWrite |= static_cast<char>((input[i] & bitMask) << startShift);
    +          startShift -= bitSize;
    +        }
    +        writeByte(toWrite);
    +      }
    +
    +    } else {
    +      numBytes = bitSize / 8;
    +
    +      for (uint32_t i = offset; i < endOffSet; ++i) {
    +        for (uint32_t j = 0; j < numBytes; ++j) {
    +          char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255);
    +          writeByte(toWrite);
    +        }
    +      }
    +    }
    +
    +    return;
    +  }
    +
    +  // write for unaligned bit size
    +  uint32_t bitsLeft = 8;
    +  char current = 0;
    +  for(uint32_t i = offset; i < (offset + len); i++) {
    +    long value = input[i];
    --- End diff --
    
    int64_t value


---

[GitHub] orc issue #273: ORC-343 Enable C++ writer to support RleV2

Posted by xndai <gi...@git.apache.org>.

Github user xndai commented on the issue:

    https://github.com/apache/orc/pull/273
  
    @majetideepak this is RLEv2 change that was promised.
    
    @yuruiz Could you also include some perf data obtained from offline testing?


---