You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by om...@apache.org on 2015/07/06 23:52:50 UTC
[21/23] orc git commit: ORC-23. Simplify directory structure.
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/Exceptions.cc
----------------------------------------------------------------------
diff --git a/c++/src/Exceptions.cc b/c++/src/Exceptions.cc
new file mode 100644
index 0000000..ae0e3d1
--- /dev/null
+++ b/c++/src/Exceptions.cc
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Exceptions.hh"
+
+namespace orc {
+
+ NotImplementedYet::NotImplementedYet(const std::string& what_arg
+ ) : logic_error(what_arg) {
+ // PASS
+ }
+
+ NotImplementedYet::NotImplementedYet(const char* what_arg
+ ) :logic_error(what_arg) {
+ // PASS
+ }
+
+ NotImplementedYet::NotImplementedYet(const NotImplementedYet& error
+ ): logic_error(error) {
+ // PASS
+ }
+
+ NotImplementedYet::~NotImplementedYet() noexcept {
+ // PASS
+ }
+
+ ParseError::ParseError(const std::string& what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ ParseError::ParseError(const char* what_arg
+ ): runtime_error(what_arg) {
+ // PASS
+ }
+
+ ParseError::ParseError(const ParseError& error): runtime_error(error) {
+ // PASS
+ }
+
+ ParseError::~ParseError() noexcept {
+ // PASS
+ }
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/Exceptions.hh
----------------------------------------------------------------------
diff --git a/c++/src/Exceptions.hh b/c++/src/Exceptions.hh
new file mode 100644
index 0000000..4706085
--- /dev/null
+++ b/c++/src/Exceptions.hh
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_EXCEPTIONS_HH
+#define ORC_EXCEPTIONS_HH
+
+#include "Adaptor.hh"
+
+#include <stdexcept>
+#include <string>
+
+namespace orc {
+
+ class NotImplementedYet: public std::logic_error {
+ public:
+ explicit NotImplementedYet(const std::string& what_arg);
+ explicit NotImplementedYet(const char* what_arg);
+ virtual ~NotImplementedYet() noexcept;
+ NotImplementedYet(const NotImplementedYet&);
+ private:
+ NotImplementedYet& operator=(const NotImplementedYet&);
+ };
+
+ class ParseError: public std::runtime_error {
+ public:
+ explicit ParseError(const std::string& what_arg);
+ explicit ParseError(const char* what_arg);
+ virtual ~ParseError() noexcept;
+ ParseError(const ParseError&);
+ private:
+ ParseError& operator=(const ParseError&);
+ };
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/Int128.cc
----------------------------------------------------------------------
diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc
new file mode 100644
index 0000000..ece7850
--- /dev/null
+++ b/c++/src/Int128.cc
@@ -0,0 +1,438 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+#include "Adaptor.hh"
+
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+namespace orc {
+
+ Int128 Int128::maximumValue() {
+ return Int128(0x7fffffffffffffff, 0xfffffffffffffff);
+ }
+
+ Int128 Int128::minimumValue() {
+ return Int128(static_cast<int64_t>(0x8000000000000000), 0x0);
+ }
+
+ Int128::Int128(const std::string& str) {
+ lowbits = 0;
+ highbits = 0;
+ size_t length = str.length();
+ if (length > 0) {
+ bool isNegative = str[0] == '-';
+ size_t posn = isNegative ? 1 : 0;
+ while (posn < length) {
+ size_t group = std::min(18ul, length - posn);
+ int64_t chunk = std::stoll(str.substr(posn, group));
+ int64_t multiple = 1;
+ for(size_t i=0; i < group; ++i) {
+ multiple *= 10;
+ }
+ *this *= multiple;
+ *this += chunk;
+ posn += group;
+ }
+ if (isNegative) {
+ negate();
+ }
+ }
+ }
+
+ Int128& Int128::operator*=(const Int128 &right) {
+ const uint64_t INT_MASK = 0xffffffff;
+ const uint64_t CARRY_BIT = 1l << 32;
+
+ // Break the left and right numbers into 32 bit chunks
+ // so that we can multiply them without overflow.
+ uint64_t L0 = static_cast<uint64_t>(highbits) >> 32;
+ uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK;
+ uint64_t L2 = lowbits >> 32;
+ uint64_t L3 = lowbits & INT_MASK;
+ uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32;
+ uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK;
+ uint64_t R2 = right.lowbits >> 32;
+ uint64_t R3 = right.lowbits & INT_MASK;
+
+ uint64_t product = L3 * R3;
+ lowbits = product & INT_MASK;
+ uint64_t sum = product >> 32;
+ product = L2 * R3;
+ sum += product;
+ highbits = sum < product ? CARRY_BIT : 0;
+ product = L3 * R2;
+ sum += product;
+ if (sum < product) {
+ highbits += CARRY_BIT;
+ }
+ lowbits += sum << 32;
+ highbits += static_cast<int64_t>(sum >> 32);
+ highbits += L1 * R3 + L2 * R2 + L3 * R1;
+ highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
+ return *this;
+ }
+
+ /**
+ * Expands the given value into an array of ints so that we can work on
+ * it. The array will be converted to an absolute value and the wasNegative
+ * flag will be set appropriately. The array will remove leading zeros from
+ * the value.
+ * @param array an array of length 4 to set with the value
+ * @param wasNegative a flag for whether the value was original negative
+ * @result the output length of the array
+ */
+ int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const {
+ uint64_t high;
+ uint64_t low;
+ if (highbits < 0) {
+ low = ~lowbits + 1;
+ high = static_cast<uint64_t>(~highbits);
+ if (low == 0) {
+ high += 1;
+ }
+ wasNegative = true;
+ } else {
+ low = lowbits;
+ high = static_cast<uint64_t>(highbits);
+ wasNegative = false;
+ }
+ if (high != 0) {
+ if (high > UINT32_MAX) {
+ array[0] = static_cast<uint32_t>(high >> 32);
+ array[1] = static_cast<uint32_t>(high);
+ array[2] = static_cast<uint32_t>(low >> 32);
+ array[3] = static_cast<uint32_t>(low);
+ return 4;
+ } else {
+ array[0] = static_cast<uint32_t>(high);
+ array[1] = static_cast<uint32_t>(low >> 32);
+ array[2] = static_cast<uint32_t>(low);
+ return 3;
+ }
+ } else if (low >= UINT32_MAX) {
+ array[0] = static_cast<uint32_t>(low >> 32);
+ array[1] = static_cast<uint32_t>(low);
+ return 2;
+ } else if (low == 0) {
+ return 0;
+ } else {
+ array[0] = static_cast<uint32_t>(low);
+ return 1;
+ }
+ }
+
+
+ /**
+ * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is
+ * the MSB. We can replace this with bsrq asm instruction on x64.
+ */
+ int64_t fls(uint32_t x) {
+ int64_t bitpos = 0;
+ while (x) {
+ x >>= 1;
+ bitpos += 1;
+ }
+ return bitpos;
+ }
+
+ /**
+ * Shift the number in the array left by bits positions.
+ * @param array the number to shift, must have length elements
+ * @param length the number of entries in the array
+ * @param bits the number of bits to shift (0 <= bits < 32)
+ */
+ void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for(int64_t i=0; i < length-1; ++i) {
+ array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits));
+ }
+ array[length-1] <<= bits;
+ }
+ }
+
+ /**
+ * Shift the number in the array right by bits positions.
+ * @param array the number to shift, must have length elements
+ * @param length the number of entries in the array
+ * @param bits the number of bits to shift (0 <= bits < 32)
+ */
+ void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+ if (length > 0 && bits != 0) {
+ for(int64_t i=length-1; i > 0; --i) {
+ array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits));
+ }
+ array[0] >>= bits;
+ }
+ }
+
+ /**
+ * Fix the signs of the result and remainder at the end of the division
+ * based on the signs of the dividend and divisor.
+ */
+ void fixDivisionSigns(Int128 &result, Int128 &remainder,
+ bool dividendWasNegative, bool divisorWasNegative) {
+ if (dividendWasNegative != divisorWasNegative) {
+ result.negate();
+ }
+ if (dividendWasNegative) {
+ remainder.negate();
+ }
+ }
+
+ /**
+ * Build a Int128 from a list of ints.
+ */
+ void buildFromArray(Int128& value, uint32_t* array, int64_t length) {
+ switch (length) {
+ case 0:
+ value = 0;
+ break;
+ case 1:
+ value = array[0];
+ break;
+ case 2:
+ value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
+ break;
+ case 3:
+ value = Int128(array[0],
+ (static_cast<uint64_t>(array[1]) << 32) + array[2]);
+ break;
+ case 4:
+ value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
+ (static_cast<uint64_t>(array[2]) << 32) + array[3]);
+ break;
+ case 5:
+ if (array[0] != 0) {
+ throw std::logic_error("Can't build Int128 with 5 ints.");
+ }
+ value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
+ (static_cast<uint64_t>(array[3]) << 32) + array[4]);
+ break;
+ default:
+ throw std::logic_error("Unsupported length for building Int128");
+ }
+ }
+
+ /**
+ * Do a division where the divisor fits into a single 32 bit value.
+ */
+ Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
+ uint32_t divisor, Int128& remainder,
+ bool dividendWasNegative, bool divisorWasNegative) {
+ uint64_t r = 0;
+ uint32_t resultArray[5];
+ for(int64_t j=0; j < dividendLength; j++) {
+ r <<= 32;
+ r += dividend[j];
+ resultArray[j] = static_cast<uint32_t>(r / divisor);
+ r %= divisor;
+ }
+ Int128 result;
+ buildFromArray(result, resultArray, dividendLength);
+ remainder = static_cast<int64_t>(r);
+ fixDivisionSigns(result, remainder, dividendWasNegative,
+ divisorWasNegative);
+ return result;
+ }
+
+ Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const {
+ // Split the dividend and divisor into integer pieces so that we can
+ // work on them.
+ uint32_t dividendArray[5];
+ uint32_t divisorArray[4];
+ bool dividendWasNegative;
+ bool divisorWasNegative;
+ // leave an extra zero before the dividend
+ dividendArray[0] = 0;
+ int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1;
+ int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative);
+
+ // Handle some of the easy cases.
+ if (dividendLength <= divisorLength) {
+ remainder = *this;
+ return 0;
+ } else if (divisorLength == 0) {
+ throw std::range_error("Division by 0 in Int128");
+ } else if (divisorLength == 1) {
+ return singleDivide(dividendArray, dividendLength, divisorArray[0],
+ remainder, dividendWasNegative, divisorWasNegative);
+ }
+
+ int64_t resultLength = dividendLength - divisorLength;
+ uint32_t resultArray[4];
+
+ // Normalize by shifting both by a multiple of 2 so that
+ // the digit guessing is better. The requirement is that
+ // divisorArray[0] is greater than 2**31.
+ int64_t normalizeBits = 32 - fls(divisorArray[0]);
+ shiftArrayLeft(divisorArray, divisorLength, normalizeBits);
+ shiftArrayLeft(dividendArray, dividendLength, normalizeBits);
+
+ // compute each digit in the result
+ for(int64_t j=0; j < resultLength; ++j) {
+ // Guess the next digit. At worst it is two too large
+ uint32_t guess = UINT32_MAX;
+ uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 |
+ dividendArray[j+1];
+ if (dividendArray[j] != divisorArray[0]) {
+ guess = static_cast<uint32_t>(highDividend / divisorArray[0]);
+ }
+
+ // catch all of the cases where guess is two too large and most of the
+ // cases where it is one too large
+ uint32_t rhat =
+ static_cast<uint32_t>(highDividend - guess *
+ static_cast<uint64_t>(divisorArray[0]));
+ while (static_cast<uint64_t>(divisorArray[1]) * guess >
+ (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) {
+ guess -= 1;
+ rhat += divisorArray[0];
+ if (static_cast<uint64_t>(rhat) < divisorArray[0]) {
+ break;
+ }
+ }
+
+ // subtract off the guess * divisor from the dividend
+ uint64_t mult = 0;
+ for(int64_t i=divisorLength-1; i >= 0; --i) {
+ mult += static_cast<uint64_t>(guess) * divisorArray[i];
+ uint32_t prev = dividendArray[j+i+1];
+ dividendArray[j+i+1] -= static_cast<uint32_t>(mult);
+ mult >>= 32;
+ if (dividendArray[j+i+1] > prev) {
+ mult += 1;
+ }
+ }
+ uint32_t prev = dividendArray[j];
+ dividendArray[j] -= static_cast<uint32_t>(mult);
+
+ // if guess was too big, we add back divisor
+ if (dividendArray[j] > prev) {
+ guess -= 1;
+ uint32_t carry = 0;
+ for(int64_t i=divisorLength-1; i >= 0; --i) {
+ uint64_t sum = static_cast<uint64_t>(divisorArray[i]) +
+ dividendArray[j+i+1] + carry;
+ dividendArray[j+i+1] = static_cast<uint32_t>(sum);
+ carry = static_cast<uint32_t>(sum >> 32);
+ }
+ dividendArray[j] += carry;
+ }
+
+ resultArray[j] = guess;
+ }
+
+ // denormalize the remainder
+ shiftArrayRight(dividendArray, dividendLength, normalizeBits);
+
+ // return result and remainder
+ Int128 result;
+ buildFromArray(result, resultArray, resultLength);
+ buildFromArray(remainder, dividendArray, dividendLength);
+ fixDivisionSigns(result, remainder,
+ dividendWasNegative, divisorWasNegative);
+ return result;
+ }
+
+ std::string Int128::toString() const {
+ // 10**18 - the largest power of 10 less than 63 bits
+ const Int128 tenTo18(0xde0b6b3a7640000);
+ // 10**36
+ const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000);
+ Int128 remainder;
+ std::stringstream buf;
+ bool needFill = false;
+
+ // get anything above 10**36 and print it
+ Int128 top = divide(tenTo36, remainder);
+ if (top != 0) {
+ buf << top.toLong();
+ remainder.abs();
+ needFill = true;
+ }
+
+ // now get anything above 10**18 and print it
+ Int128 tail;
+ top = remainder.divide(tenTo18, tail);
+ if (needFill || top != 0) {
+ if (needFill) {
+ buf << std::setw(18) << std::setfill('0');
+ } else {
+ needFill = true;
+ tail.abs();
+ }
+ buf << top.toLong();
+ }
+
+ // finally print the tail, which is less than 10**18
+ if (needFill) {
+ buf << std::setw(18) << std::setfill('0');
+ }
+ buf << tail.toLong();
+ return buf.str();
+ }
+
+ std::string Int128::toDecimalString(int32_t scale) const {
+ std::string str = toString();
+ if (scale == 0) {
+ return str;
+ } else if (*this < 0) {
+ int32_t len = static_cast<int32_t>(str.length());
+ if (len - 1 > scale) {
+ return str.substr(0, static_cast<size_t>(len - scale)) + "." +
+ str.substr(static_cast<size_t>(len - scale),
+ static_cast<size_t>(scale));
+ } else if (len - 1 == scale) {
+ return "-0." + str.substr(1, std::string::npos);
+ } else {
+ std::string result = "-0.";
+ for(int32_t i=0; i < scale - len + 1; ++i) {
+ result += "0";
+ }
+ return result + str.substr(1, std::string::npos);
+ }
+ } else {
+ int32_t len = static_cast<int32_t>(str.length());
+ if (len > scale) {
+ return str.substr(0, static_cast<size_t>(len - scale)) + "." +
+ str.substr(static_cast<size_t>(len - scale),
+ static_cast<size_t>(scale));
+ } else if (len == scale) {
+ return "0." + str;
+ } else {
+ std::string result = "0.";
+ for(int32_t i=0; i < scale - len; ++i) {
+ result += "0";
+ }
+ return result + str;
+ }
+ }
+ }
+
+ std::string Int128::toHexString() const {
+ std::stringstream buf;
+ buf << std::hex << "0x"
+ << std::setw(16) << std::setfill('0') << highbits
+ << std::setw(16) << std::setfill('0') << lowbits;
+ return buf.str();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/MemoryPool.cc
----------------------------------------------------------------------
diff --git a/c++/src/MemoryPool.cc b/c++/src/MemoryPool.cc
new file mode 100644
index 0000000..28cc9e1
--- /dev/null
+++ b/c++/src/MemoryPool.cc
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/Int128.hh"
+#include "orc/MemoryPool.hh"
+
+#include "Adaptor.hh"
+
+#include <cstdlib>
+#include <iostream>
+#include <string.h>
+
+namespace orc {
+
+ MemoryPool::~MemoryPool() {
+ // PASS
+ }
+
+ class MemoryPoolImpl: public MemoryPool {
+ public:
+ virtual ~MemoryPoolImpl();
+
+ char* malloc(uint64_t size) override;
+ void free(char* p) override;
+ };
+
+ char* MemoryPoolImpl::malloc(uint64_t size) {
+ return static_cast<char*>(std::malloc(size));
+ }
+
+ void MemoryPoolImpl::free(char* p) {
+ std::free(p);
+ }
+
+ MemoryPoolImpl::~MemoryPoolImpl() {
+ // PASS
+ }
+
+ template <class T>
+ DataBuffer<T>::DataBuffer(MemoryPool& pool,
+ uint64_t newSize
+ ): memoryPool(pool),
+ buf(nullptr),
+ currentSize(0),
+ currentCapacity(0) {
+ resize(newSize);
+ }
+
+ template <class T>
+ DataBuffer<T>::~DataBuffer(){
+ for(uint64_t i=currentSize; i > 0; --i) {
+ (buf + i - 1)->~T();
+ }
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <class T>
+ void DataBuffer<T>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (currentSize > newSize) {
+ for(uint64_t i=currentSize; i > newSize; --i) {
+ (buf + i - 1)->~T();
+ }
+ } else if (newSize > currentSize) {
+ for(uint64_t i=currentSize; i < newSize; ++i) {
+ new (buf + i) T();
+ }
+ }
+ currentSize = newSize;
+ }
+
+ template <class T>
+ void DataBuffer<T>::reserve(uint64_t newCapacity){
+ if (newCapacity > currentCapacity) {
+ if (buf) {
+ T* buf_old = buf;
+ buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
+ memcpy(buf, buf_old, sizeof(T) * currentSize);
+ memoryPool.free(reinterpret_cast<char*>(buf_old));
+ } else {
+ buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity));
+ }
+ currentCapacity = newCapacity;
+ }
+ }
+
+ // Specializations for char
+
+ template <>
+ DataBuffer<char>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<char>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, newSize - currentSize);
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for char*
+
+ template <>
+ DataBuffer<char*>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<char*>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for double
+
+ template <>
+ DataBuffer<double>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<double>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for int64_t
+
+ template <>
+ DataBuffer<int64_t>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<int64_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(int64_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for uint64_t
+
+ template <>
+ DataBuffer<uint64_t>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<uint64_t>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t));
+ }
+ currentSize = newSize;
+ }
+
+ // Specializations for unsigned char
+
+ template <>
+ DataBuffer<unsigned char>::~DataBuffer(){
+ if (buf) {
+ memoryPool.free(reinterpret_cast<char*>(buf));
+ }
+ }
+
+ template <>
+ void DataBuffer<unsigned char>::resize(uint64_t newSize) {
+ reserve(newSize);
+ if (newSize > currentSize) {
+ memset(buf + currentSize, 0, newSize - currentSize);
+ }
+ currentSize = newSize;
+ }
+
+ #ifdef __clang__
+ #pragma clang diagnostic ignored "-Wweak-template-vtables"
+ #endif
+
+ template class DataBuffer<char>;
+ template class DataBuffer<char*>;
+ template class DataBuffer<double>;
+ template class DataBuffer<Int128>;
+ template class DataBuffer<int64_t>;
+ template class DataBuffer<uint64_t>;
+ template class DataBuffer<unsigned char>;
+
+ #ifdef __clang__
+ #pragma clang diagnostic ignored "-Wexit-time-destructors"
+ #endif
+
+ MemoryPool* getDefaultPool() {
+ static MemoryPoolImpl internal;
+ return &internal;
+ }
+} // namespace orc
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/OrcFile.cc
----------------------------------------------------------------------
diff --git a/c++/src/OrcFile.cc b/c++/src/OrcFile.cc
new file mode 100644
index 0000000..f8c22c4
--- /dev/null
+++ b/c++/src/OrcFile.cc
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/OrcFile.hh"
+
+#include "Adaptor.hh"
+#include "Exceptions.hh"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace orc {
+
+ class FileInputStream : public InputStream {
+ private:
+ std::string filename ;
+ int file;
+ uint64_t totalLength;
+
+ public:
+ FileInputStream(std::string _filename) {
+ filename = _filename ;
+ file = open(filename.c_str(), O_RDONLY);
+ if (file == -1) {
+ throw ParseError("Can't open " + filename);
+ }
+ struct stat fileStat;
+ if (fstat(file, &fileStat) == -1) {
+ throw ParseError("Can't stat " + filename);
+ }
+ totalLength = static_cast<uint64_t>(fileStat.st_size);
+ }
+
+ ~FileInputStream();
+
+ uint64_t getLength() const override {
+ return totalLength;
+ }
+
+ void read(void* buf,
+ uint64_t length,
+ uint64_t offset) override {
+ if (!buf) {
+ throw ParseError("Buffer is null");
+ }
+ ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset));
+
+ if (bytesRead == -1) {
+ throw ParseError("Bad read of " + filename);
+ }
+ if (static_cast<uint64_t>(bytesRead) != length) {
+ throw ParseError("Short read of " + filename);
+ }
+ }
+
+ const std::string& getName() const override {
+ return filename;
+ }
+ };
+
+ FileInputStream::~FileInputStream() {
+ close(file);
+ }
+
+ std::unique_ptr<InputStream> readLocalFile(const std::string& path) {
+ return std::unique_ptr<InputStream>(new FileInputStream(path));
+ }
+}
+
+#ifndef HAS_STOLL
+
+ #include <sstream>
+
+ int64_t std::stoll(std::string str) {
+ int64_t val = 0;
+ stringstream ss ;
+ ss << str ;
+ ss >> val ;
+ return val;
+ }
+
+#endif
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLE.cc
----------------------------------------------------------------------
diff --git a/c++/src/RLE.cc b/c++/src/RLE.cc
new file mode 100644
index 0000000..51bd628
--- /dev/null
+++ b/c++/src/RLE.cc
@@ -0,0 +1,47 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "RLEv1.hh"
+#include "RLEv2.hh"
+#include "Exceptions.hh"
+
+namespace orc {
+
+ RleDecoder::~RleDecoder() {
+ // PASS
+ }
+
+ std::unique_ptr<RleDecoder> createRleDecoder
+ (std::unique_ptr<SeekableInputStream> input,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool& pool) {
+ switch (static_cast<int64_t>(version)) {
+ case RleVersion_1:
+ // We don't have std::make_unique() yet.
+ return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input),
+ isSigned));
+ case RleVersion_2:
+ return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input),
+ isSigned, pool));
+ default:
+ throw NotImplementedYet("Not implemented yet");
+ }
+ }
+
+} // namespace orc
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLE.hh
----------------------------------------------------------------------
diff --git a/c++/src/RLE.hh b/c++/src/RLE.hh
new file mode 100644
index 0000000..0a44c95
--- /dev/null
+++ b/c++/src/RLE.hh
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_RLE_HH
+#define ORC_RLE_HH
+
+#include "Compression.hh"
+
+#include <memory>
+
+namespace orc {
+
+ inline int64_t unZigZag(uint64_t value) {
+ return value >> 1 ^ -(value & 1);
+ }
+
+ class RleDecoder {
+ public:
+ // must be non-inline!
+ virtual ~RleDecoder();
+
+ /**
+ * Seek to a particular spot.
+ */
+ virtual void seek(PositionProvider&) = 0;
+
+ /**
+ * Seek over a given number of values.
+ */
+ virtual void skip(uint64_t numValues) = 0;
+
+ /**
+ * Read a number of values into the batch.
+ * @param data the array to read into
+ * @param numValues the number of values to read
+ * @param notNull If the pointer is null, all values are read. If the
+ * pointer is not null, positions that are false are skipped.
+ */
+ virtual void next(int64_t* data, uint64_t numValues,
+ const char* notNull) = 0;
+ };
+
+ enum RleVersion {
+ RleVersion_1,
+ RleVersion_2
+ };
+
+ /**
+ * Create an RLE decoder.
+ * @param input the input stream to read from
+ * @param isSigned true if the number sequence is signed
+ * @param version version of RLE decoding to do
+ * @param pool memory pool to use for allocation
+ */
+ std::unique_ptr<RleDecoder> createRleDecoder
+ (std::unique_ptr<SeekableInputStream> input,
+ bool isSigned,
+ RleVersion version,
+ MemoryPool& pool);
+
+} // namespace orc
+
+#endif // ORC_RLE_HH
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLEv1.cc
----------------------------------------------------------------------
diff --git a/c++/src/RLEv1.cc b/c++/src/RLEv1.cc
new file mode 100644
index 0000000..91bb79d
--- /dev/null
+++ b/c++/src/RLEv1.cc
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "Exceptions.hh"
+#include "RLEv1.hh"
+
+#include <algorithm>
+
+namespace orc {
+
+const uint64_t MINIMUM_REPEAT = 3;
+const uint64_t BASE_128_MASK = 0x7f;
+
+signed char RleDecoderV1::readByte() {
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in readByte");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+ return *(bufferStart++);
+}
+
+uint64_t RleDecoderV1::readLong() {
+ uint64_t result = 0;
+ int64_t offset = 0;
+ signed char ch = readByte();
+ if (ch >= 0) {
+ result = static_cast<uint64_t>(ch);
+ } else {
+ result = static_cast<uint64_t>(ch) & BASE_128_MASK;
+ while ((ch = readByte()) < 0) {
+ offset += 7;
+ result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
+ }
+ result |= static_cast<uint64_t>(ch) << (offset + 7);
+ }
+ return result;
+}
+
+void RleDecoderV1::skipLongs(uint64_t numValues) {
+ while (numValues > 0) {
+ if (readByte() >= 0) {
+ --numValues;
+ }
+ }
+}
+
+void RleDecoderV1::readHeader() {
+ signed char ch = readByte();
+ if (ch < 0) {
+ remainingValues = static_cast<uint64_t>(-ch);
+ repeating = false;
+ } else {
+ remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
+ repeating = true;
+ delta = readByte();
+ value = isSigned
+ ? unZigZag(readLong())
+ : static_cast<int64_t>(readLong());
+ }
+}
+
+RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
+ bool hasSigned)
+ : inputStream(std::move(input)),
+ isSigned(hasSigned),
+ remainingValues(0),
+ bufferStart(nullptr),
+ bufferEnd(bufferStart) {
+}
+
+void RleDecoderV1::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // force a re-read from the stream
+ bufferEnd = bufferStart;
+ // read a new header
+ readHeader();
+ // skip ahead the given number of records
+ skip(location.next());
+}
+
+void RleDecoderV1::skip(uint64_t numValues) {
+ while (numValues > 0) {
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ uint64_t count = std::min(numValues, remainingValues);
+ remainingValues -= count;
+ numValues -= count;
+ if (repeating) {
+ value += delta * static_cast<int64_t>(count);
+ } else {
+ skipLongs(count);
+ }
+ }
+}
+
+void RleDecoderV1::next(int64_t* const data,
+ const uint64_t numValues,
+ const char* const notNull) {
+ uint64_t position = 0;
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ while (position < numValues) {
+ // If we are out of values, read more.
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ // How many do we read out of this block?
+ uint64_t count = std::min(numValues - position, remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = value + static_cast<int64_t>(consumed) * delta;
+ consumed += 1;
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = value + static_cast<int64_t>(i) * delta;
+ }
+ consumed = count;
+ }
+ value += static_cast<int64_t>(consumed) * delta;
+ } else {
+ if (notNull) {
+ for (uint64_t i = 0 ; i < count; ++i) {
+ if (notNull[i]) {
+ data[position + i] = isSigned
+ ? unZigZag(readLong())
+ : static_cast<int64_t>(readLong());
+ ++consumed;
+ }
+ }
+ } else {
+ if (isSigned) {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = unZigZag(readLong());
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ data[position + i] = static_cast<int64_t>(readLong());
+ }
+ }
+ consumed = count;
+ }
+ }
+ remainingValues -= consumed;
+ position += count;
+
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ }
+}
+
+} // namespace orc
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLEv1.hh
----------------------------------------------------------------------
diff --git a/c++/src/RLEv1.hh b/c++/src/RLEv1.hh
new file mode 100644
index 0000000..95e50a3
--- /dev/null
+++ b/c++/src/RLEv1.hh
@@ -0,0 +1,70 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef ORC_RLEV1_HH
+#define ORC_RLEV1_HH
+
+#include "Adaptor.hh"
+#include "RLE.hh"
+
+#include <memory>
+
+namespace orc {
+
+class RleDecoderV1 : public RleDecoder {
+public:
+ RleDecoderV1(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned);
+
+ /**
+ * Seek to a particular spot.
+ */
+ void seek(PositionProvider&) override;
+
+ /**
+ * Seek over a given number of values.
+ */
+ void skip(uint64_t numValues) override;
+
+ /**
+ * Read a number of values into the batch.
+ */
+ void next(int64_t* data, uint64_t numValues,
+ const char* notNull) override;
+
+private:
+ inline signed char readByte();
+
+ inline void readHeader();
+
+ inline uint64_t readLong();
+
+ inline void skipLongs(uint64_t numValues);
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+ uint64_t remainingValues;
+ int64_t value;
+ const char *bufferStart;
+ const char *bufferEnd;
+ int64_t delta;
+ bool repeating;
+};
+} // namespace orc
+
+#endif // ORC_RLEV1_HH
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLEv2.cc
----------------------------------------------------------------------
diff --git a/c++/src/RLEv2.cc b/c++/src/RLEv2.cc
new file mode 100644
index 0000000..43428b4
--- /dev/null
+++ b/c++/src/RLEv2.cc
@@ -0,0 +1,482 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Adaptor.hh"
+#include "Compression.hh"
+#include "RLEv2.hh"
+
+#define MIN_REPEAT 3
+
+namespace orc {
+
+struct FixedBitSizes {
+ enum FBS {
+ ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE,
+ THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN,
+ TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX,
+ TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR
+ };
+};
+
+inline uint32_t decodeBitWidth(uint32_t n) {
+ if (n >= FixedBitSizes::ONE &&
+ n <= FixedBitSizes::TWENTYFOUR) {
+ return n + 1;
+ } else if (n == FixedBitSizes::TWENTYSIX) {
+ return 26;
+ } else if (n == FixedBitSizes::TWENTYEIGHT) {
+ return 28;
+ } else if (n == FixedBitSizes::THIRTY) {
+ return 30;
+ } else if (n == FixedBitSizes::THIRTYTWO) {
+ return 32;
+ } else if (n == FixedBitSizes::FORTY) {
+ return 40;
+ } else if (n == FixedBitSizes::FORTYEIGHT) {
+ return 48;
+ } else if (n == FixedBitSizes::FIFTYSIX) {
+ return 56;
+ } else {
+ return 64;
+ }
+}
+
+inline uint32_t getClosestFixedBits(uint32_t n) {
+ if (n == 0) {
+ return 1;
+ }
+
+ if (n >= 1 && n <= 24) {
+ return n;
+ } else if (n > 24 && n <= 26) {
+ return 26;
+ } else if (n > 26 && n <= 28) {
+ return 28;
+ } else if (n > 28 && n <= 30) {
+ return 30;
+ } else if (n > 30 && n <= 32) {
+ return 32;
+ } else if (n > 32 && n <= 40) {
+ return 40;
+ } else if (n > 40 && n <= 48) {
+ return 48;
+ } else if (n > 48 && n <= 56) {
+ return 56;
+ } else {
+ return 64;
+ }
+}
+
+int64_t RleDecoderV2::readLongBE(uint64_t bsz) {
+ int64_t ret = 0, val;
+ uint64_t n = bsz;
+ while (n > 0) {
+ n--;
+ val = readByte();
+ ret |= (val << (n * 8));
+ }
+ return ret;
+}
+
+inline int64_t RleDecoderV2::readVslong() {
+ return unZigZag(readVulong());
+}
+
+uint64_t RleDecoderV2::readVulong() {
+ uint64_t ret = 0, b;
+ uint64_t offset = 0;
+ do {
+ b = readByte();
+ ret |= (0x7f & b) << offset;
+ offset += 7;
+ } while (b >= 0x80);
+ return ret;
+}
+
+RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
+ bool _isSigned, MemoryPool& pool
+ ): inputStream(std::move(input)),
+ isSigned(_isSigned),
+ firstByte(0),
+ runLength(0),
+ runRead(0),
+ bufferStart(nullptr),
+ bufferEnd(bufferStart),
+ deltaBase(0),
+ byteSize(0),
+ firstValue(0),
+ prevValue(0),
+ bitSize(0),
+ bitsLeft(0),
+ curByte(0),
+ patchBitSize(0),
+ base(0),
+ curGap(0),
+ patchMask(0),
+ actualGap(0),
+ unpacked(pool, 0),
+ unpackedPatch(pool, 0) {
+ // PASS
+}
+
+void RleDecoderV2::seek(PositionProvider& location) {
+ // move the input stream
+ inputStream->seek(location);
+ // clear state
+ bufferEnd = bufferStart = 0;
+ runRead = runLength = 0;
+ // skip ahead the given number of records
+ skip(location.next());
+}
+
+void RleDecoderV2::skip(uint64_t numValues) {
+ // simple for now, until perf tests indicate something encoding specific is
+ // needed
+ const uint64_t N = 64;
+ int64_t dummy[N];
+
+ while (numValues) {
+ uint64_t nRead = std::min(N, numValues);
+ next(dummy, nRead, nullptr);
+ numValues -= nRead;
+ }
+}
+
+void RleDecoderV2::next(int64_t* const data,
+ const uint64_t numValues,
+ const char* const notNull) {
+ uint64_t nRead = 0;
+
+ while (nRead < numValues) {
+ // Skip any nulls before attempting to read first byte.
+ while (notNull && !notNull[nRead]) {
+ if (++nRead == numValues) {
+ return; // ended with null values
+ }
+ }
+
+ if (runRead == runLength) {
+ resetRun();
+ firstByte = readByte();
+ }
+
+ uint64_t offset = nRead, length = numValues - nRead;
+
+ EncodingType enc = static_cast<EncodingType>
+ ((firstByte >> 6) & 0x03);
+ switch(static_cast<int64_t>(enc)) {
+ case SHORT_REPEAT:
+ nRead += nextShortRepeats(data, offset, length, notNull);
+ break;
+ case DIRECT:
+ nRead += nextDirect(data, offset, length, notNull);
+ break;
+ case PATCHED_BASE:
+ nRead += nextPatched(data, offset, length, notNull);
+ break;
+ case DELTA:
+ nRead += nextDelta(data, offset, length, notNull);
+ break;
+ default:
+ throw ParseError("unknown encoding");
+ }
+ }
+}
+
+uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bytes
+ byteSize = (firstByte >> 3) & 0x07;
+ byteSize += 1;
+
+ runLength = firstByte & 0x07;
+ // run lengths values are stored only after MIN_REPEAT value is met
+ runLength += MIN_REPEAT;
+ runRead = 0;
+
+ // read the repeated value which is store using fixed bytes
+ firstValue = readLongBE(byteSize);
+
+ if (isSigned) {
+ firstValue = unZigZag(static_cast<uint64_t>(firstValue));
+ }
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ if (notNull) {
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ if (notNull[pos]) {
+ data[pos] = firstValue;
+ ++runRead;
+ }
+ }
+ } else {
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ data[pos] = firstValue;
+ ++runRead;
+ }
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextDirect(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ runRead += readLongs(data, offset, nRead, bitSize, notNull);
+
+ if (isSigned) {
+ if (notNull) {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ if (notNull[pos]) {
+ data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
+ }
+ }
+ } else {
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ data[pos] = unZigZag(static_cast<uint64_t>(data[pos]));
+ }
+ }
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextPatched(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+
+ // extract the number of bytes occupied by base
+ uint64_t thirdByte = readByte();
+ byteSize = (thirdByte >> 5) & 0x07;
+ // base width is one off
+ byteSize += 1;
+
+ // extract patch width
+ uint32_t pwo = thirdByte & 0x1f;
+ patchBitSize = decodeBitWidth(pwo);
+
+ // read fourth byte and extract patch gap width
+ uint64_t fourthByte = readByte();
+ uint32_t pgw = (fourthByte >> 5) & 0x07;
+ // patch gap width is one off
+ pgw += 1;
+
+ // extract the length of the patch list
+ size_t pl = fourthByte & 0x1f;
+ if (pl == 0) {
+ throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!");
+ }
+
+ // read the next base width number of bytes to extract base value
+ base = readLongBE(byteSize);
+ int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
+ // if mask of base value is 1 then base is negative value else positive
+ if ((base & mask) != 0) {
+ base = base & ~mask;
+ base = -base;
+ }
+
+ // TODO: something more efficient than resize
+ unpacked.resize(runLength);
+ unpackedIdx = 0;
+ readLongs(unpacked.data(), 0, runLength, bitSize);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // TODO: something more efficient than resize
+ unpackedPatch.resize(pl);
+ patchIdx = 0;
+ // TODO: Skip corrupt?
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
+ if ((patchBitSize + pgw) > 64) {
+ throw ParseError("Corrupt PATCHED_BASE encoded data "
+ "(patchBitSize + pgw > 64)!");
+ }
+ uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);
+ readLongs(unpackedPatch.data(), 0, pl, cfb);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // apply the patch directly when decoding the packed data
+ patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
+
+ adjustGapAndPatch();
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ for(uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ if (static_cast<int64_t>(unpackedIdx) != actualGap) {
+ // no patching required. add base to unpacked value to get final value
+ data[pos] = base + unpacked[unpackedIdx];
+ } else {
+ // extract the patch value
+ int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize);
+
+ // add base to patched value
+ data[pos] = base + patchedVal;
+
+ // increment the patch to point to next entry in patch list
+ ++patchIdx;
+
+ if (patchIdx < unpackedPatch.size()) {
+ adjustGapAndPatch();
+
+ // next gap is relative to the current gap
+ actualGap += unpackedIdx;
+ }
+ }
+
+ ++runRead;
+ ++unpackedIdx;
+ }
+
+ return nRead;
+}
+
+uint64_t RleDecoderV2::nextDelta(int64_t* const data,
+ uint64_t offset,
+ uint64_t numValues,
+ const char* const notNull) {
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ if (fbo != 0) {
+ bitSize = decodeBitWidth(fbo);
+ } else {
+ bitSize = 0;
+ }
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ ++runLength; // account for first value
+ runRead = deltaBase = 0;
+
+ // read the first value stored as vint
+ if (isSigned) {
+ firstValue = static_cast<int64_t>(readVslong());
+ } else {
+ firstValue = static_cast<int64_t>(readVulong());
+ }
+
+ prevValue = firstValue;
+
+ // read the fixed delta value stored as vint (deltas can be negative even
+ // if all number are positive)
+ deltaBase = static_cast<int64_t>(readVslong());
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ uint64_t pos = offset;
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead == 0 && pos < offset + nRead) {
+ data[pos++] = firstValue;
+ ++runRead;
+ }
+
+ if (bitSize == 0) {
+ // add fixed deltas to adjacent values
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + deltaBase;
+ ++runRead;
+ }
+ } else {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead < 2 && pos < offset + nRead) {
+ // add delta base and first value
+ prevValue = data[pos++] = firstValue + deltaBase;
+ ++runRead;
+ }
+
+ // write the unpacked values, add it to previous value and store final
+ // value to result buffer. if the delta base value is negative then it
+ // is a decreasing sequence else an increasing sequence
+ uint64_t remaining = (offset + nRead) - pos;
+ runRead += readLongs(data, pos, remaining, bitSize, notNull);
+
+ if (deltaBase < 0) {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue - data[pos];
+ }
+ } else {
+ for ( ; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + data[pos];
+ }
+ }
+ }
+ return nRead;
+}
+
+} // namespace orc
http://git-wip-us.apache.org/repos/asf/orc/blob/7f55b453/c++/src/RLEv2.hh
----------------------------------------------------------------------
diff --git a/c++/src/RLEv2.hh b/c++/src/RLEv2.hh
new file mode 100644
index 0000000..2923009
--- /dev/null
+++ b/c++/src/RLEv2.hh
@@ -0,0 +1,175 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef ORC_RLEV2_HH
+#define ORC_RLEV2_HH
+
+#include "Adaptor.hh"
+#include "Exceptions.hh"
+#include "RLE.hh"
+
+#include <vector>
+
+namespace orc {
+
+class RleDecoderV2 : public RleDecoder {
+public:
+
+ enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 };
+
+ RleDecoderV2(std::unique_ptr<SeekableInputStream> input,
+ bool isSigned, MemoryPool& pool);
+
+ /**
+ * Seek to a particular spot.
+ */
+ void seek(PositionProvider&) override;
+
+ /**
+ * Seek over a given number of values.
+ */
+ void skip(uint64_t numValues) override;
+
+ /**
+ * Read a number of values into the batch.
+ */
+ void next(int64_t* data, uint64_t numValues,
+ const char* notNull) override;
+
+private:
+
+ // Used by PATCHED_BASE
+ void adjustGapAndPatch() {
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
+ patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ actualGap = 0;
+
+ // special case: gap is >255 then patch value will be 0.
+ // if gap is <=255 then patch value cannot be 0
+ while (curGap == 255 && curPatch == 0) {
+ actualGap += 255;
+ ++patchIdx;
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >>
+ patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += curGap;
+ }
+
+ void resetReadLongs() {
+ bitsLeft = 0;
+ curByte = 0;
+ }
+
+ void resetRun() {
+ resetReadLongs();
+ bitSize = 0;
+ }
+
+ unsigned char readByte() {
+ if (bufferStart == bufferEnd) {
+ int bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ throw ParseError("bad read in RleDecoderV2::readByte");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+
+ unsigned char result = static_cast<unsigned char>(*bufferStart++);
+ return result;
+}
+
+ int64_t readLongBE(uint64_t bsz);
+ int64_t readVslong();
+ uint64_t readVulong();
+ uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len,
+ uint64_t fb, const char* notNull = nullptr) {
+ uint64_t ret = 0;
+
+ // TODO: unroll to improve performance
+ for(uint64_t i = offset; i < (offset + len); i++) {
+ // skip null positions
+ if (notNull && !notNull[i]) {
+ continue;
+ }
+ uint64_t result = 0;
+ uint64_t bitsLeftToRead = fb;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= curByte & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ curByte = readByte();
+ bitsLeft = 8;
+ }
+
+ // handle the left over bits
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
+ result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ data[i] = static_cast<int64_t>(result);
+ ++ret;
+ }
+
+ return ret;
+}
+
+
+ uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+ uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues,
+ const char* notNull);
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+
+ unsigned char firstByte;
+ uint64_t runLength;
+ uint64_t runRead;
+ const char *bufferStart;
+ const char *bufferEnd;
+ int64_t deltaBase; // Used by DELTA
+ uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE
+ int64_t firstValue; // Used by SHORT_REPEAT and DELTA
+ int64_t prevValue; // Used by DELTA
+ uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA
+ uint32_t bitsLeft; // Used by anything that uses readLongs
+ uint32_t curByte; // Used by anything that uses readLongs
+ uint32_t patchBitSize; // Used by PATCHED_BASE
+ uint64_t unpackedIdx; // Used by PATCHED_BASE
+ uint64_t patchIdx; // Used by PATCHED_BASE
+ int64_t base; // Used by PATCHED_BASE
+ uint64_t curGap; // Used by PATCHED_BASE
+ int64_t curPatch; // Used by PATCHED_BASE
+ int64_t patchMask; // Used by PATCHED_BASE
+ int64_t actualGap; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
+};
+} // namespace orc
+
+#endif // ORC_RLEV2_HH