You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@orc.apache.org by ga...@apache.org on 2022/11/08 06:38:47 UTC
[orc] branch main updated: ORC-1307: Add `.clang-format` to enforce C++ code style
This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new b5cbed459 ORC-1307: Add `.clang-format` to enforce C++ code style
b5cbed459 is described below
commit b5cbed4599c15068bdc46ea741ecae4a306d90db
Author: Gang Wu <us...@gmail.com>
AuthorDate: Tue Nov 8 14:38:41 2022 +0800
ORC-1307: Add `.clang-format` to enforce C++ code style
This closes #1308
---
.clang-format | 26 +
.github/workflows/build_and_test.yml | 16 +
c++/include/orc/BloomFilter.hh | 8 +-
c++/include/orc/ColumnPrinter.hh | 16 +-
c++/include/orc/Common.hh | 57 +-
c++/include/orc/Exceptions.hh | 23 +-
c++/include/orc/Int128.hh | 59 +-
c++/include/orc/MemoryPool.hh | 25 +-
c++/include/orc/OrcFile.hh | 19 +-
c++/include/orc/Reader.hh | 53 +-
c++/include/orc/Statistics.hh | 58 +-
c++/include/orc/Type.hh | 32 +-
c++/include/orc/Vector.hh | 40 +-
c++/include/orc/Writer.hh | 36 +-
c++/include/orc/sargs/Literal.hh | 66 +-
c++/include/orc/sargs/SearchArgument.hh | 62 +-
c++/include/orc/sargs/TruthValue.hh | 18 +-
c++/src/Adaptor.cc | 46 +-
c++/src/BlockBuffer.cc | 23 +-
c++/src/BlockBuffer.hh | 13 +-
c++/src/BloomFilter.cc | 66 +-
c++/src/BloomFilter.hh | 37 +-
c++/src/ByteRLE.cc | 163 +-
c++/src/ByteRLE.hh | 27 +-
c++/src/ColumnPrinter.cc | 450 +-
c++/src/ColumnReader.cc | 1096 ++--
c++/src/ColumnReader.hh | 31 +-
c++/src/ColumnWriter.cc | 1146 ++---
c++/src/ColumnWriter.hh | 66 +-
c++/src/Common.cc | 16 +-
c++/src/Compression.cc | 889 ++--
c++/src/Compression.hh | 25 +-
c++/src/Exceptions.cc | 28 +-
c++/src/Int128.cc | 179 +-
c++/src/LzoDecompressor.cc | 64 +-
c++/src/LzoDecompressor.hh | 8 +-
c++/src/MemoryPool.cc | 63 +-
c++/src/Murmur3.cc | 12 +-
c++/src/Murmur3.hh | 10 +-
c++/src/Options.hh | 53 +-
c++/src/OrcFile.cc | 47 +-
c++/src/OrcHdfsFile.cc | 96 +-
c++/src/RLE.cc | 91 +-
c++/src/RLE.hh | 52 +-
c++/src/RLEV2Util.cc | 70 +-
c++/src/RLEV2Util.hh | 38 +-
c++/src/RLEv1.cc | 436 +-
c++/src/RLEv1.hh | 75 +-
c++/src/RLEv2.hh | 326 +-
c++/src/Reader.cc | 579 +--
c++/src/Reader.hh | 117 +-
c++/src/RleDecoderV2.cc | 1286 +++--
c++/src/RleEncoderV2.cc | 773 +--
c++/src/Statistics.cc | 150 +-
c++/src/Statistics.hh | 512 +-
c++/src/StripeStream.cc | 94 +-
c++/src/StripeStream.hh | 112 +-
c++/src/Timezone.cc | 317 +-
c++/src/Timezone.hh | 14 +-
c++/src/TypeImpl.cc | 648 ++-
c++/src/TypeImpl.hh | 64 +-
c++/src/Utils.hh | 71 +-
c++/src/Vector.cc | 195 +-
c++/src/Writer.cc | 278 +-
c++/src/io/InputStream.cc | 61 +-
c++/src/io/InputStream.hh | 50 +-
c++/src/io/OutputStream.cc | 35 +-
c++/src/io/OutputStream.hh | 47 +-
c++/src/sargs/ExpressionTree.cc | 46 +-
c++/src/sargs/ExpressionTree.hh | 8 +-
c++/src/sargs/Literal.cc | 30 +-
c++/src/sargs/PredicateLeaf.cc | 282 +-
c++/src/sargs/PredicateLeaf.hh | 62 +-
c++/src/sargs/SargsApplier.cc | 81 +-
c++/src/sargs/SargsApplier.hh | 48 +-
c++/src/sargs/SearchArgument.cc | 171 +-
c++/src/sargs/SearchArgument.hh | 99 +-
c++/src/sargs/TruthValue.cc | 2 +-
c++/src/wrap/coded-stream-wrapper.h | 6 +-
c++/src/wrap/gmock.h | 20 +-
c++/src/wrap/gtest-wrapper.h | 35 +-
c++/src/wrap/orc-proto-wrapper.cc | 38 +-
c++/src/wrap/orc-proto-wrapper.hh | 24 +-
c++/src/wrap/snappy-wrapper.h | 2 +-
c++/src/wrap/zero-copy-stream-wrapper.h | 8 +-
c++/test/CreateTestFiles.cc | 17 +-
c++/test/MemoryInputStream.cc | 5 +-
c++/test/MemoryInputStream.hh | 20 +-
c++/test/MemoryOutputStream.cc | 6 +-
c++/test/MemoryOutputStream.hh | 17 +-
c++/test/OrcTest.hh | 2 +-
c++/test/TestAttributes.cc | 37 +-
c++/test/TestBlockBuffer.cc | 4 +-
c++/test/TestBloomFilter.cc | 38 +-
c++/test/TestBufferedOutputStream.cc | 39 +-
c++/test/TestByteRLEEncoder.cc | 89 +-
c++/test/TestByteRle.cc | 2584 +++++-----
c++/test/TestColumnPrinter.cc | 316 +-
c++/test/TestColumnReader.cc | 8432 +++++++++++++++----------------
c++/test/TestColumnStatistics.cc | 65 +-
c++/test/TestCompression.cc | 203 +-
c++/test/TestDecimal.cc | 65 +-
c++/test/TestDecompression.cc | 425 +-
c++/test/TestDictionaryEncoding.cc | 122 +-
c++/test/TestDriver.cc | 5 +-
c++/test/TestInt128.cc | 50 +-
c++/test/TestMurmur3.cc | 19 +-
c++/test/TestPredicateLeaf.cc | 943 ++--
c++/test/TestPredicatePushdown.cc | 212 +-
c++/test/TestRLEV2Util.cc | 4 +-
c++/test/TestReader.cc | 138 +-
c++/test/TestRleDecoder.cc | 5468 ++++++++++----------
c++/test/TestRleEncoder.cc | 127 +-
c++/test/TestSargsApplier.cc | 104 +-
c++/test/TestSearchArgument.cc | 644 ++-
c++/test/TestStripeIndexStatistics.cc | 58 +-
c++/test/TestTimestampStatistics.cc | 61 +-
c++/test/TestTimezone.cc | 228 +-
c++/test/TestType.cc | 154 +-
c++/test/TestWriter.cc | 1005 ++--
tools/src/CSVFileImport.cc | 207 +-
tools/src/FileContents.cc | 11 +-
tools/src/FileMemory.cc | 42 +-
tools/src/FileMetadata.cc | 131 +-
tools/src/FileScan.cc | 12 +-
tools/src/FileStatistics.cc | 78 +-
tools/src/TimezoneDump.cc | 7 +-
tools/src/ToolsHelper.cc | 49 +-
tools/src/ToolsHelper.hh | 3 +-
tools/test/TestCSVFileImport.cc | 48 +-
tools/test/TestFileContents.cc | 27 +-
tools/test/TestFileMetadata.cc | 521 +-
tools/test/TestFileScan.cc | 202 +-
tools/test/TestFileStatistics.cc | 291 +-
tools/test/TestMatch.cc | 5742 +++++++++++----------
tools/test/ToolTest.cc | 26 +-
tools/test/ToolTest.hh | 8 +-
tools/test/gzip.cc | 44 +-
tools/test/gzip.hh | 12 +-
139 files changed, 19154 insertions(+), 22734 deletions(-)
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..0779071a3
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+---
+Language: Cpp
+BasedOnStyle: Google
+ColumnLimit: 100
+IndentWidth: 2
+NamespaceIndentation: All
+UseTab: Never
+AllowShortFunctionsOnASingleLine: Empty
+DerivePointerAlignment: false
+IncludeBlocks: Preserve
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 793bb8c57..890c5d5cf 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -106,3 +106,19 @@ jobs:
cd java
mvn install -DskipTests
mvn javadoc:javadoc
+
+ formatting-check:
+ name: "C++ format check"
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ path:
+ - 'c++'
+ - 'tools'
+ steps:
+ - uses: actions/checkout@v3
+ - name: Run clang-format style check for C++ code
+ uses: jidicula/clang-format-action@v4.9.0
+ with:
+ clang-format-version: '13'
+ check-path: ${{ matrix.path }}
diff --git a/c++/include/orc/BloomFilter.hh b/c++/include/orc/BloomFilter.hh
index 91277392c..d08f6deac 100644
--- a/c++/include/orc/BloomFilter.hh
+++ b/c++/include/orc/BloomFilter.hh
@@ -27,11 +27,11 @@
namespace orc {
class BloomFilter {
- public:
+ public:
virtual ~BloomFilter();
// test if the element exists in BloomFilter
- virtual bool testBytes(const char * data, int64_t length) const = 0;
+ virtual bool testBytes(const char* data, int64_t length) const = 0;
virtual bool testLong(int64_t data) const = 0;
virtual bool testDouble(double data) const = 0;
};
@@ -40,6 +40,6 @@ namespace orc {
std::vector<std::shared_ptr<BloomFilter>> entries;
};
-}
+} // namespace orc
-#endif //ORC_BLOOMFILTER_HH
+#endif // ORC_BLOOMFILTER_HH
diff --git a/c++/include/orc/ColumnPrinter.hh b/c++/include/orc/ColumnPrinter.hh
index aa1921473..79093e4c0 100644
--- a/c++/include/orc/ColumnPrinter.hh
+++ b/c++/include/orc/ColumnPrinter.hh
@@ -19,12 +19,11 @@
#ifndef ORC_COLUMN_PRINTER_HH
#define ORC_COLUMN_PRINTER_HH
-#include "orc/orc-config.hh"
#include "orc/OrcFile.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
#include <stdio.h>
-#include <string>
#include <memory>
#include <string>
#include <vector>
@@ -32,12 +31,12 @@
namespace orc {
class ColumnPrinter {
- protected:
- std::string &buffer;
- bool hasNulls ;
+ protected:
+ std::string& buffer;
+ bool hasNulls;
const char* notNull;
- public:
+ public:
ColumnPrinter(std::string&);
virtual ~ColumnPrinter();
virtual void printRow(uint64_t rowId) = 0;
@@ -45,7 +44,6 @@ namespace orc {
virtual void reset(const ColumnVectorBatch& batch);
};
- ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&,
- const Type* type);
-}
+ ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, const Type* type);
+} // namespace orc
#endif
diff --git a/c++/include/orc/Common.hh b/c++/include/orc/Common.hh
index e51e37e71..2b9a3f213 100644
--- a/c++/include/orc/Common.hh
+++ b/c++/include/orc/Common.hh
@@ -19,47 +19,45 @@
#ifndef ORC_COMMON_HH
#define ORC_COMMON_HH
-#include "orc/Vector.hh"
-#include "orc/Type.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
+#include "orc/Vector.hh"
#include <string>
namespace orc {
class FileVersion {
- private:
+ private:
uint32_t majorVersion;
uint32_t minorVersion;
- public:
+
+ public:
static const FileVersion& v_0_11();
static const FileVersion& v_0_12();
static const FileVersion& UNSTABLE_PRE_2_0();
- FileVersion(uint32_t major, uint32_t minor) :
- majorVersion(major), minorVersion(minor) {
- }
+ FileVersion(uint32_t major, uint32_t minor) : majorVersion(major), minorVersion(minor) {}
/**
* Get major version
*/
uint32_t getMajor() const {
- return this->majorVersion;
+ return this->majorVersion;
}
/**
* Get minor version
*/
uint32_t getMinor() const {
- return this->minorVersion;
+ return this->minorVersion;
}
- bool operator == (const FileVersion & right) const {
- return this->majorVersion == right.getMajor() &&
- this->minorVersion == right.getMinor();
+ bool operator==(const FileVersion& right) const {
+ return this->majorVersion == right.getMajor() && this->minorVersion == right.getMinor();
}
- bool operator != (const FileVersion & right) const {
+ bool operator!=(const FileVersion& right) const {
return !(*this == right);
}
@@ -140,7 +138,7 @@ namespace orc {
std::string streamKindToString(StreamKind kind);
class StreamInformation {
- public:
+ public:
virtual ~StreamInformation();
virtual StreamKind getKind() const = 0;
@@ -159,7 +157,7 @@ namespace orc {
std::string columnEncodingKindToString(ColumnEncodingKind kind);
class StripeInformation {
- public:
+ public:
virtual ~StripeInformation();
/**
@@ -184,7 +182,7 @@ namespace orc {
* Get the length of the stripe's data.
* @return the number of bytes in the stripe
*/
- virtual uint64_t getDataLength()const = 0;
+ virtual uint64_t getDataLength() const = 0;
/**
* Get the length of the stripe's tail section, which contains its index.
@@ -206,8 +204,7 @@ namespace orc {
/**
* Get the StreamInformation for the given stream.
*/
- virtual ORC_UNIQUE_PTR<StreamInformation>
- getStreamInformation(uint64_t streamId) const = 0;
+ virtual ORC_UNIQUE_PTR<StreamInformation> getStreamInformation(uint64_t streamId) const = 0;
/**
* Get the column encoding for the given column.
@@ -238,10 +235,8 @@ namespace orc {
template <>
inline bool compare(Decimal val1, Decimal val2) {
// compare integral parts
- Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value,
- val1.scale);
- Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value,
- val2.scale);
+ Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, val1.scale);
+ Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, val2.scale);
if (integral1 < integral2) {
return true;
@@ -253,25 +248,17 @@ namespace orc {
// unnecessary to check overflow here because the scaled number will not
// exceed original ones
bool overflow = false, positive = val1.value >= 0;
- val1.value -= scaleUpInt128ByPowerOfTen(integral1,
- val1.scale,
- overflow);
- val2.value -= scaleUpInt128ByPowerOfTen(integral2,
- val2.scale,
- overflow);
+ val1.value -= scaleUpInt128ByPowerOfTen(integral1, val1.scale, overflow);
+ val2.value -= scaleUpInt128ByPowerOfTen(integral2, val2.scale, overflow);
int32_t diff = val1.scale - val2.scale;
if (diff > 0) {
- val2.value = scaleUpInt128ByPowerOfTen(val2.value,
- diff,
- overflow);
+ val2.value = scaleUpInt128ByPowerOfTen(val2.value, diff, overflow);
if (overflow) {
return positive ? true : false;
}
} else {
- val1.value = scaleUpInt128ByPowerOfTen(val1.value,
- -diff,
- overflow);
+ val1.value = scaleUpInt128ByPowerOfTen(val1.value, -diff, overflow);
if (overflow) {
return positive ? false : true;
}
@@ -317,6 +304,6 @@ namespace orc {
return !(lhs != rhs);
}
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Exceptions.hh b/c++/include/orc/Exceptions.hh
index 9765d4fd6..a1ac2b2a9 100644
--- a/c++/include/orc/Exceptions.hh
+++ b/c++/include/orc/Exceptions.hh
@@ -26,35 +26,38 @@
namespace orc {
- class NotImplementedYet: public std::logic_error {
- public:
+ class NotImplementedYet : public std::logic_error {
+ public:
explicit NotImplementedYet(const std::string& what_arg);
explicit NotImplementedYet(const char* what_arg);
virtual ~NotImplementedYet() ORC_NOEXCEPT;
NotImplementedYet(const NotImplementedYet&);
- private:
+
+ private:
NotImplementedYet& operator=(const NotImplementedYet&);
};
- class ParseError: public std::runtime_error {
- public:
+ class ParseError : public std::runtime_error {
+ public:
explicit ParseError(const std::string& what_arg);
explicit ParseError(const char* what_arg);
virtual ~ParseError() ORC_NOEXCEPT;
ParseError(const ParseError&);
- private:
+
+ private:
ParseError& operator=(const ParseError&);
};
- class InvalidArgument: public std::runtime_error {
- public:
+ class InvalidArgument : public std::runtime_error {
+ public:
explicit InvalidArgument(const std::string& what_arg);
explicit InvalidArgument(const char* what_arg);
virtual ~InvalidArgument() ORC_NOEXCEPT;
InvalidArgument(const InvalidArgument&);
- private:
+
+ private:
InvalidArgument& operator=(const InvalidArgument&);
};
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Int128.hh b/c++/include/orc/Int128.hh
index 1f68b2b11..52a718237 100644
--- a/c++/include/orc/Int128.hh
+++ b/c++/include/orc/Int128.hh
@@ -35,7 +35,7 @@ namespace orc {
*
*/
class Int128 {
- public:
+ public:
Int128() {
highbits = 0;
lowbits = 0;
@@ -110,7 +110,7 @@ namespace orc {
* @param right the number to add
* @return *this
*/
- Int128& operator+=(const Int128 &right) {
+ Int128& operator+=(const Int128& right) {
uint64_t sum = lowbits + right.lowbits;
highbits += right.highbits;
if (sum < lowbits) {
@@ -125,7 +125,7 @@ namespace orc {
* @param right the number to subtract
* @return *this
*/
- Int128& operator-=(const Int128 &right) {
+ Int128& operator-=(const Int128& right) {
uint64_t diff = lowbits - right.lowbits;
highbits -= right.highbits;
if (diff > lowbits) {
@@ -140,7 +140,7 @@ namespace orc {
* @param right the number to multiply by
* @return *this
*/
- Int128& operator*=(const Int128 &right);
+ Int128& operator*=(const Int128& right);
/**
* Divide this number by right and return the result. This operation is
@@ -154,14 +154,14 @@ namespace orc {
* @param right the number to divide by
* @param remainder the remainder after the division
*/
- Int128 divide(const Int128 &right, Int128& remainder) const;
+ Int128 divide(const Int128& right, Int128& remainder) const;
/**
* Logical or between two Int128.
* @param right the number to or in
* @return *this
*/
- Int128& operator|=(const Int128 &right) {
+ Int128& operator|=(const Int128& right) {
lowbits |= right.lowbits;
highbits |= right.highbits;
return *this;
@@ -172,7 +172,7 @@ namespace orc {
* @param right the number to and in
* @return *this
*/
- Int128& operator&=(const Int128 &right) {
+ Int128& operator&=(const Int128& right) {
lowbits &= right.lowbits;
highbits &= right.highbits;
return *this;
@@ -183,7 +183,7 @@ namespace orc {
* @param right the number to and in
* @return logical and result
*/
- Int128 operator&(const Int128 &right) {
+ Int128 operator&(const Int128& right) {
Int128 value = *this;
value &= right;
return value;
@@ -219,8 +219,7 @@ namespace orc {
if (bits < 64) {
lowbits >>= bits;
lowbits |= static_cast<uint64_t>(highbits << (64 - bits));
- highbits = static_cast<int64_t>
- (static_cast<uint64_t>(highbits) >> bits);
+ highbits = static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits);
} else if (bits < 128) {
lowbits = static_cast<uint64_t>(highbits >> (bits - 64));
highbits = highbits >= 0 ? 0 : -1l;
@@ -240,7 +239,7 @@ namespace orc {
return highbits != right.highbits || lowbits != right.lowbits;
}
- bool operator<(const Int128 &right) const {
+ bool operator<(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits < right.lowbits;
} else {
@@ -248,7 +247,7 @@ namespace orc {
}
}
- bool operator<=(const Int128 &right) const {
+ bool operator<=(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits <= right.lowbits;
} else {
@@ -256,7 +255,7 @@ namespace orc {
}
}
- bool operator>(const Int128 &right) const {
+ bool operator>(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits > right.lowbits;
} else {
@@ -264,7 +263,7 @@ namespace orc {
}
}
- bool operator>=(const Int128 &right) const {
+ bool operator>=(const Int128& right) const {
if (highbits == right.highbits) {
return lowbits >= right.lowbits;
} else {
@@ -273,10 +272,8 @@ namespace orc {
}
uint32_t hash() const {
- return static_cast<uint32_t>(highbits >> 32) ^
- static_cast<uint32_t>(highbits) ^
- static_cast<uint32_t>(lowbits >> 32) ^
- static_cast<uint32_t>(lowbits);
+ return static_cast<uint32_t>(highbits >> 32) ^ static_cast<uint32_t>(highbits) ^
+ static_cast<uint32_t>(lowbits >> 32) ^ static_cast<uint32_t>(lowbits);
}
/**
@@ -284,12 +281,12 @@ namespace orc {
*/
bool fitsInLong() const {
switch (highbits) {
- case 0:
- return 0 == (lowbits & LONG_SIGN_BIT);
- case -1:
- return 0 != (lowbits & LONG_SIGN_BIT);
- default:
- return false;
+ case 0:
+ return 0 == (lowbits & LONG_SIGN_BIT);
+ case -1:
+ return 0 != (lowbits & LONG_SIGN_BIT);
+ default:
+ return false;
}
}
@@ -316,8 +313,7 @@ namespace orc {
* @param trimTrailingZeros whether or not to trim trailing zeros
* @return converted string representation
*/
- std::string toDecimalString(int32_t scale = 0,
- bool trimTrailingZeros = false) const;
+ std::string toDecimalString(int32_t scale = 0, bool trimTrailingZeros = false) const;
/**
* Return the base 16 string representation of the two's complement with
@@ -347,15 +343,14 @@ namespace orc {
* @param wasNegative set to true if the original number was negative
* @return the number of elements that were set in the array (1 to 4)
*/
- int64_t fillInArray(uint32_t* array, bool &wasNegative) const;
+ int64_t fillInArray(uint32_t* array, bool& wasNegative) const;
- private:
+ private:
static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u;
int64_t highbits;
uint64_t lowbits;
};
-
/**
* Scales up an Int128 value
* @param value the Int128 value to scale
@@ -363,9 +358,7 @@ namespace orc {
* @param overflow returns whether the result overflows or not
* @return the scaled value
*/
- Int128 scaleUpInt128ByPowerOfTen(Int128 value,
- int32_t power,
- bool &overflow);
+ Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow);
/**
* Scales down an Int128 value
* @param value the Int128 value to scale
@@ -373,5 +366,5 @@ namespace orc {
* @return the scaled value
*/
Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power);
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/MemoryPool.hh b/c++/include/orc/MemoryPool.hh
index 080327159..347505412 100644
--- a/c++/include/orc/MemoryPool.hh
+++ b/c++/include/orc/MemoryPool.hh
@@ -19,15 +19,15 @@
#ifndef MEMORYPOOL_HH_
#define MEMORYPOOL_HH_
-#include "orc/orc-config.hh"
#include "orc/Int128.hh"
+#include "orc/orc-config.hh"
#include <memory>
namespace orc {
class MemoryPool {
- public:
+ public:
virtual ~MemoryPool();
virtual char* malloc(uint64_t size) = 0;
@@ -37,7 +37,7 @@ namespace orc {
template <class T>
class DataBuffer {
- private:
+ private:
MemoryPool& memoryPool;
T* buf;
// current size
@@ -49,7 +49,7 @@ namespace orc {
DataBuffer(DataBuffer& buffer);
DataBuffer& operator=(DataBuffer& buffer);
- public:
+ public:
DataBuffer(MemoryPool& pool, uint64_t _size = 0);
DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT;
@@ -132,10 +132,10 @@ namespace orc {
template <>
void DataBuffer<unsigned char>::resize(uint64_t newSize);
- #ifdef __clang__
- #pragma clang diagnostic push
- #pragma clang diagnostic ignored "-Wweak-template-vtables"
- #endif
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
extern template class DataBuffer<char>;
extern template class DataBuffer<char*>;
@@ -145,10 +145,9 @@ namespace orc {
extern template class DataBuffer<uint64_t>;
extern template class DataBuffer<unsigned char>;
- #ifdef __clang__
- #pragma clang diagnostic pop
- #endif
-} // namespace orc
-
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+} // namespace orc
#endif /* MEMORYPOOL_HH_ */
diff --git a/c++/include/orc/OrcFile.hh b/c++/include/orc/OrcFile.hh
index 1ae7725f2..ddc7843bb 100644
--- a/c++/include/orc/OrcFile.hh
+++ b/c++/include/orc/OrcFile.hh
@@ -21,9 +21,9 @@
#include <string>
-#include "orc/orc-config.hh"
#include "orc/Reader.hh"
#include "orc/Writer.hh"
+#include "orc/orc-config.hh"
/** /file orc/OrcFile.hh
@brief The top level interface to ORC.
@@ -35,7 +35,7 @@ namespace orc {
* An abstract interface for providing ORC readers a stream of bytes.
*/
class InputStream {
- public:
+ public:
virtual ~InputStream();
/**
@@ -56,9 +56,7 @@ namespace orc {
* @param length the number of bytes to read.
* @param offset the position in the stream to read from.
*/
- virtual void read(void* buf,
- uint64_t length,
- uint64_t offset) = 0;
+ virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
/**
* Get the name of the stream for error messages.
@@ -70,7 +68,7 @@ namespace orc {
* An abstract interface for providing ORC writer a stream of bytes.
*/
class OutputStream {
- public:
+ public:
virtual ~OutputStream();
/**
@@ -107,8 +105,7 @@ namespace orc {
* @param path the name of the file in the local file system or HDFS
* @param metrics the metrics of the reader
*/
- ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path,
- ReaderMetrics* metrics = nullptr);
+ ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path, ReaderMetrics* metrics = nullptr);
/**
* Create a stream to a local file.
@@ -145,10 +142,8 @@ namespace orc {
* @param stream the stream to write to
* @param options the options for writing the file
*/
- ORC_UNIQUE_PTR<Writer> createWriter(
- const Type& type,
- OutputStream* stream,
+ ORC_UNIQUE_PTR<Writer> createWriter(const Type& type, OutputStream* stream,
const WriterOptions& options);
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Reader.hh b/c++/include/orc/Reader.hh
index 6f4aaf67d..98e4e8b3a 100644
--- a/c++/include/orc/Reader.hh
+++ b/c++/include/orc/Reader.hh
@@ -21,18 +21,18 @@
#include "orc/BloomFilter.hh"
#include "orc/Common.hh"
-#include "orc/orc-config.hh"
#include "orc/Statistics.hh"
-#include "orc/sargs/SearchArgument.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
+#include "orc/sargs/SearchArgument.hh"
+#include <atomic>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
-#include <atomic>
namespace orc {
@@ -66,10 +66,10 @@ namespace orc {
* Options for creating a Reader.
*/
class ReaderOptions {
- private:
+ private:
ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits;
- public:
+ public:
ReaderOptions();
ReaderOptions(const ReaderOptions&);
ReaderOptions(ReaderOptions&);
@@ -144,10 +144,10 @@ namespace orc {
* Options for creating a RowReader.
*/
class RowReaderOptions {
- private:
+ private:
ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits;
- public:
+ public:
RowReaderOptions();
RowReaderOptions(const RowReaderOptions&);
RowReaderOptions(RowReaderOptions&);
@@ -200,8 +200,7 @@ namespace orc {
* @param idReadIntentMap a map of IdReadIntentMap.
* @return this
*/
- RowReaderOptions&
- includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap);
+ RowReaderOptions& includeTypesWithIntents(const IdReadIntentMap& idReadIntentMap);
/**
* Set the section of the file to process.
@@ -327,7 +326,6 @@ namespace orc {
const IdReadIntentMap getIdReadIntentMap() const;
};
-
class RowReader;
/**
@@ -335,7 +333,7 @@ namespace orc {
* This is an an abstract class that will be subclassed as necessary.
*/
class Reader {
- public:
+ public:
virtual ~Reader();
/**
@@ -425,8 +423,7 @@ namespace orc {
* @param stripeIndex the index of the stripe (0 to N-1) to get information about
* @return the information about that stripe
*/
- virtual ORC_UNIQUE_PTR<StripeInformation>
- getStripe(uint64_t stripeIndex) const = 0;
+ virtual ORC_UNIQUE_PTR<StripeInformation> getStripe(uint64_t stripeIndex) const = 0;
/**
* Get the number of stripe statistics in the file.
@@ -439,8 +436,7 @@ namespace orc {
* @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
* @return the statistics about that stripe
*/
- virtual ORC_UNIQUE_PTR<StripeStatistics>
- getStripeStatistics(uint64_t stripeIndex) const = 0;
+ virtual ORC_UNIQUE_PTR<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
/**
* Get the length of the data stripes in the file.
@@ -483,8 +479,7 @@ namespace orc {
* @param columnId id of the column
* @return the information about the column
*/
- virtual ORC_UNIQUE_PTR<ColumnStatistics>
- getColumnStatistics(uint32_t columnId) const = 0;
+ virtual ORC_UNIQUE_PTR<ColumnStatistics> getColumnStatistics(uint32_t columnId) const = 0;
/**
* Check if the file has correct column statistics.
@@ -535,13 +530,13 @@ namespace orc {
* based on the information in the file footer.
* The bound is less tight if only few columns are read or compression is
* used.
- */
+ */
/**
* @param stripeIx index of the stripe to be read (if not specified,
* all stripes are considered).
* @return upper bound on memory use by all columns
*/
- virtual uint64_t getMemoryUse(int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUse(int stripeIx = -1) = 0;
/**
* @param include Column Field Ids
@@ -549,7 +544,8 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include,
+ int stripeIx = -1) = 0;
/**
* @param names Column Names
@@ -557,7 +553,7 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx = -1) = 0;
/**
* @param include Column Type Ids
@@ -565,7 +561,8 @@ namespace orc {
* all stripes are considered).
* @return upper bound on memory use by selected columns
*/
- virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0;
+ virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include,
+ int stripeIx = -1) = 0;
/**
* Get BloomFiters of all selected columns in the specified stripe
@@ -574,8 +571,8 @@ namespace orc {
* all columns that have bloom filters are considered).
* @return map of bloom filters with the key standing for the index of column.
*/
- virtual std::map<uint32_t, BloomFilterIndex>
- getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
+ virtual std::map<uint32_t, BloomFilterIndex> getBloomFilters(
+ uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0;
};
/**
@@ -583,7 +580,7 @@ namespace orc {
* This is an an abstract class that will be subclassed as necessary.
*/
class RowReader {
- public:
+ public:
virtual ~RowReader();
/**
* Get the selected type of the rows in the file. The file's row type
@@ -605,8 +602,7 @@ namespace orc {
* @param size the number of rows to read
* @return a new ColumnVectorBatch to read into
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const = 0;
+ virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0;
/**
* Read the next row batch from the current position.
@@ -629,8 +625,7 @@ namespace orc {
* @param rowNumber the next row the reader should return
*/
virtual void seekToRow(uint64_t rowNumber) = 0;
-
};
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index 4d7caeab3..c08885b57 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -19,9 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH
-#include "orc/orc-config.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
+
+#include <sstream>
namespace orc {
@@ -29,7 +31,7 @@ namespace orc {
* Statistics that are available for all types of columns.
*/
class ColumnStatistics {
- public:
+ public:
virtual ~ColumnStatistics();
/**
@@ -54,8 +56,8 @@ namespace orc {
/**
* Statistics for binary columns.
*/
- class BinaryColumnStatistics: public ColumnStatistics {
- public:
+ class BinaryColumnStatistics : public ColumnStatistics {
+ public:
virtual ~BinaryColumnStatistics();
/**
@@ -70,8 +72,8 @@ namespace orc {
/**
* Statistics for boolean columns.
*/
- class BooleanColumnStatistics: public ColumnStatistics {
- public:
+ class BooleanColumnStatistics : public ColumnStatistics {
+ public:
virtual ~BooleanColumnStatistics();
/**
@@ -87,8 +89,8 @@ namespace orc {
/**
* Statistics for date columns.
*/
- class DateColumnStatistics: public ColumnStatistics {
- public:
+ class DateColumnStatistics : public ColumnStatistics {
+ public:
virtual ~DateColumnStatistics();
/**
@@ -119,8 +121,8 @@ namespace orc {
/**
* Statistics for decimal columns.
*/
- class DecimalColumnStatistics: public ColumnStatistics {
- public:
+ class DecimalColumnStatistics : public ColumnStatistics {
+ public:
virtual ~DecimalColumnStatistics();
/**
@@ -163,8 +165,8 @@ namespace orc {
/**
* Statistics for float and double columns.
*/
- class DoubleColumnStatistics: public ColumnStatistics {
- public:
+ class DoubleColumnStatistics : public ColumnStatistics {
+ public:
virtual ~DoubleColumnStatistics();
/**
@@ -210,8 +212,8 @@ namespace orc {
* Statistics for all of the integer columns, such as byte, short, int, and
* long.
*/
- class IntegerColumnStatistics: public ColumnStatistics {
- public:
+ class IntegerColumnStatistics : public ColumnStatistics {
+ public:
virtual ~IntegerColumnStatistics();
/**
@@ -256,8 +258,8 @@ namespace orc {
/**
* Statistics for string columns.
*/
- class StringColumnStatistics: public ColumnStatistics {
- public:
+ class StringColumnStatistics : public ColumnStatistics {
+ public:
virtual ~StringColumnStatistics();
/**
@@ -282,13 +284,13 @@ namespace orc {
* Get the minimum value for the column.
* @return minimum value
*/
- virtual const std::string & getMinimum() const = 0;
+ virtual const std::string& getMinimum() const = 0;
/**
* Get the maximum value for the column.
* @return maximum value
*/
- virtual const std::string & getMaximum() const = 0;
+ virtual const std::string& getMaximum() const = 0;
/**
* Get the total length of all values.
@@ -300,8 +302,8 @@ namespace orc {
/**
* Statistics for timestamp columns.
*/
- class TimestampColumnStatistics: public ColumnStatistics {
- public:
+ class TimestampColumnStatistics : public ColumnStatistics {
+ public:
virtual ~TimestampColumnStatistics();
/**
@@ -366,7 +368,7 @@ namespace orc {
};
class Statistics {
- public:
+ public:
virtual ~Statistics();
/**
@@ -374,8 +376,7 @@ namespace orc {
* @param colId id of the column
* @return one column's statistics
*/
- virtual const ColumnStatistics* getColumnStatistics(uint32_t colId
- ) const = 0;
+ virtual const ColumnStatistics* getColumnStatistics(uint32_t colId) const = 0;
/**
* Get the number of columns.
@@ -388,7 +389,7 @@ namespace orc {
* Statistics for all of collections such as Map and List.
*/
class CollectionColumnStatistics : public ColumnStatistics {
- public:
+ public:
virtual ~CollectionColumnStatistics();
/**
@@ -453,7 +454,7 @@ namespace orc {
};
class StripeStatistics : public Statistics {
- public:
+ public:
virtual ~StripeStatistics();
/**
@@ -462,9 +463,8 @@ namespace orc {
* @param rowIndexId RowIndex entry id
* @return statistics of the given RowIndex entry
*/
- virtual const ColumnStatistics*
- getRowIndexStatistics(
- uint32_t columnId, uint32_t rowIndexId) const = 0;
+ virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
+ uint32_t rowIndexId) const = 0;
/**
* Get the number of RowIndex statistics in a given column.
@@ -473,6 +473,6 @@ namespace orc {
*/
virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0;
};
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh
index a7df8307e..e3c9768c1 100644
--- a/c++/include/orc/Type.hh
+++ b/c++/include/orc/Type.hh
@@ -19,9 +19,9 @@
#ifndef ORC_TYPE_HH
#define ORC_TYPE_HH
-#include "orc/orc-config.hh"
-#include "orc/Vector.hh"
#include "MemoryPool.hh"
+#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
namespace orc {
@@ -48,7 +48,7 @@ namespace orc {
};
class Type {
- public:
+ public:
virtual ~Type();
virtual uint64_t getColumnId() const = 0;
virtual uint64_t getMaximumColumnId() const = 0;
@@ -59,8 +59,7 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
- virtual Type& setAttribute(const std::string& key,
- const std::string& value) = 0;
+ virtual Type& setAttribute(const std::string& key, const std::string& value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
virtual std::vector<std::string> getAttributeKeys() const = 0;
@@ -70,10 +69,8 @@ namespace orc {
/**
* Create a row batch for this type.
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size,
- MemoryPool& pool,
- bool encoded = false
- ) const = 0;
+ virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, MemoryPool& pool,
+ bool encoded = false) const = 0;
/**
* Add a new field to a struct type.
@@ -81,8 +78,7 @@ namespace orc {
* @param fieldType the type of the new field
* @return a reference to the struct type
*/
- virtual Type* addStructField(const std::string& fieldName,
- ORC_UNIQUE_PTR<Type> fieldType) = 0;
+ virtual Type* addStructField(const std::string& fieldName, ORC_UNIQUE_PTR<Type> fieldType) = 0;
/**
* Add a new child to a union type.
@@ -101,18 +97,14 @@ namespace orc {
const int64_t DEFAULT_DECIMAL_PRECISION = 38;
ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind);
- ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind,
- uint64_t maxLength);
- ORC_UNIQUE_PTR<Type>
- createDecimalType(uint64_t precision=
- DEFAULT_DECIMAL_PRECISION,
- uint64_t scale=DEFAULT_DECIMAL_SCALE);
+ ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, uint64_t maxLength);
+ ORC_UNIQUE_PTR<Type> createDecimalType(uint64_t precision = DEFAULT_DECIMAL_PRECISION,
+ uint64_t scale = DEFAULT_DECIMAL_SCALE);
ORC_UNIQUE_PTR<Type> createStructType();
ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements);
- ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key,
- ORC_UNIQUE_PTR<Type> value);
+ ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, ORC_UNIQUE_PTR<Type> value);
ORC_UNIQUE_PTR<Type> createUnionType();
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Vector.hh b/c++/include/orc/Vector.hh
index 752e1af78..27afaa71f 100644
--- a/c++/include/orc/Vector.hh
+++ b/c++/include/orc/Vector.hh
@@ -19,17 +19,17 @@
#ifndef ORC_VECTOR_HH
#define ORC_VECTOR_HH
-#include "orc/orc-config.hh"
-#include "MemoryPool.hh"
#include "Int128.hh"
+#include "MemoryPool.hh"
+#include "orc/orc-config.hh"
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
#include <list>
#include <memory>
-#include <cstring>
-#include <vector>
#include <stdexcept>
-#include <cstdlib>
-#include <iostream>
+#include <vector>
namespace orc {
@@ -83,12 +83,12 @@ namespace orc {
*/
virtual bool hasVariableLength();
- private:
+ private:
ColumnVectorBatch(const ColumnVectorBatch&);
ColumnVectorBatch& operator=(const ColumnVectorBatch&);
};
- struct LongVectorBatch: public ColumnVectorBatch {
+ struct LongVectorBatch : public ColumnVectorBatch {
LongVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~LongVectorBatch();
@@ -99,7 +99,7 @@ namespace orc {
uint64_t getMemoryUsage();
};
- struct DoubleVectorBatch: public ColumnVectorBatch {
+ struct DoubleVectorBatch : public ColumnVectorBatch {
DoubleVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~DoubleVectorBatch();
std::string toString() const;
@@ -110,7 +110,7 @@ namespace orc {
DataBuffer<double> data;
};
- struct StringVectorBatch: public ColumnVectorBatch {
+ struct StringVectorBatch : public ColumnVectorBatch {
StringVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~StringVectorBatch();
std::string toString() const;
@@ -161,7 +161,7 @@ namespace orc {
DataBuffer<int64_t> index;
};
- struct StructVectorBatch: public ColumnVectorBatch {
+ struct StructVectorBatch : public ColumnVectorBatch {
StructVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~StructVectorBatch();
std::string toString() const;
@@ -173,7 +173,7 @@ namespace orc {
std::vector<ColumnVectorBatch*> fields;
};
- struct ListVectorBatch: public ColumnVectorBatch {
+ struct ListVectorBatch : public ColumnVectorBatch {
ListVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~ListVectorBatch();
std::string toString() const;
@@ -192,7 +192,7 @@ namespace orc {
ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
};
- struct MapVectorBatch: public ColumnVectorBatch {
+ struct MapVectorBatch : public ColumnVectorBatch {
MapVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~MapVectorBatch();
std::string toString() const;
@@ -213,7 +213,7 @@ namespace orc {
ORC_UNIQUE_PTR<ColumnVectorBatch> elements;
};
- struct UnionVectorBatch: public ColumnVectorBatch {
+ struct UnionVectorBatch : public ColumnVectorBatch {
UnionVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~UnionVectorBatch();
std::string toString() const;
@@ -246,7 +246,7 @@ namespace orc {
int32_t scale;
};
- struct Decimal64VectorBatch: public ColumnVectorBatch {
+ struct Decimal64VectorBatch : public ColumnVectorBatch {
Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~Decimal64VectorBatch();
std::string toString() const;
@@ -262,7 +262,7 @@ namespace orc {
// the numeric values
DataBuffer<int64_t> values;
- protected:
+ protected:
/**
* Contains the scales that were read from the file. Should NOT be
* used.
@@ -272,7 +272,7 @@ namespace orc {
friend class Decimal64ColumnWriter;
};
- struct Decimal128VectorBatch: public ColumnVectorBatch {
+ struct Decimal128VectorBatch : public ColumnVectorBatch {
Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~Decimal128VectorBatch();
std::string toString() const;
@@ -288,7 +288,7 @@ namespace orc {
// the numeric values
DataBuffer<Int128> values;
- protected:
+ protected:
/**
* Contains the scales that were read from the file. Should NOT be
* used.
@@ -304,7 +304,7 @@ namespace orc {
* The timestamps are stored split into the time_t value (seconds since
* 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value.
*/
- struct TimestampVectorBatch: public ColumnVectorBatch {
+ struct TimestampVectorBatch : public ColumnVectorBatch {
TimestampVectorBatch(uint64_t capacity, MemoryPool& pool);
virtual ~TimestampVectorBatch();
std::string toString() const;
@@ -322,6 +322,6 @@ namespace orc {
DataBuffer<int64_t> nanoseconds;
};
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/Writer.hh b/c++/include/orc/Writer.hh
index 1a95572e7..854dc1dd7 100644
--- a/c++/include/orc/Writer.hh
+++ b/c++/include/orc/Writer.hh
@@ -20,9 +20,9 @@
#define ORC_WRITER_HH
#include "orc/Common.hh"
-#include "orc/orc-config.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
+#include "orc/orc-config.hh"
#include <atomic>
#include <memory>
@@ -35,15 +35,9 @@ namespace orc {
// classes that hold data members so we can maintain binary compatibility
struct WriterOptionsPrivate;
- enum CompressionStrategy {
- CompressionStrategy_SPEED = 0,
- CompressionStrategy_COMPRESSION
- };
+ enum CompressionStrategy { CompressionStrategy_SPEED = 0, CompressionStrategy_COMPRESSION };
- enum RleVersion {
- RleVersion_1 = 0,
- RleVersion_2 = 1
- };
+ enum RleVersion { RleVersion_1 = 0, RleVersion_2 = 1 };
class Timezone;
@@ -60,10 +54,10 @@ namespace orc {
* Options for creating a Writer.
*/
class WriterOptions {
- private:
+ private:
ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits;
- public:
+ public:
WriterOptions();
WriterOptions(const WriterOptions&);
WriterOptions(WriterOptions&);
@@ -93,7 +87,8 @@ namespace orc {
uint64_t getCompressionBlockSize() const;
/**
- * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index.
+ * Set row index stride (the number of rows per an entry in the row index). Use value 0 to
+ * disable row index.
*/
WriterOptions& setRowIndexStride(uint64_t stride);
@@ -167,13 +162,13 @@ namespace orc {
/**
* Set the memory pool.
*/
- WriterOptions& setMemoryPool(MemoryPool * memoryPool);
+ WriterOptions& setMemoryPool(MemoryPool* memoryPool);
/**
* Get the memory pool.
* @return if not set, return default memory pool.
*/
- MemoryPool * getMemoryPool() const;
+ MemoryPool* getMemoryPool() const;
/**
* Set the error stream.
@@ -184,7 +179,7 @@ namespace orc {
* Get the error stream.
* @return if not set, return std::err.
*/
- std::ostream * getErrorStream() const;
+ std::ostream* getErrorStream() const;
/**
* Get the RLE version.
@@ -249,17 +244,17 @@ namespace orc {
/**
* Set the writer metrics.
*/
- WriterOptions& setWriterMetrics(WriterMetrics * metrics);
+ WriterOptions& setWriterMetrics(WriterMetrics* metrics);
/**
* Get the writer metrics.
* @return if not set, return nullptr.
*/
- WriterMetrics * getWriterMetrics() const;
+ WriterMetrics* getWriterMetrics() const;
};
class Writer {
- public:
+ public:
virtual ~Writer();
/**
@@ -267,8 +262,7 @@ namespace orc {
* @param size the number of rows to write.
* @return a new ColumnVectorBatch to write into.
*/
- virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size
- ) const = 0;
+ virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size) const = 0;
/**
* Add a row batch into current writer.
@@ -286,6 +280,6 @@ namespace orc {
*/
virtual void addUserMetadata(const std::string name, const std::string value) = 0;
};
-}
+} // namespace orc
#endif
diff --git a/c++/include/orc/sargs/Literal.hh b/c++/include/orc/sargs/Literal.hh
index 36c9b37e3..9ce958302 100644
--- a/c++/include/orc/sargs/Literal.hh
+++ b/c++/include/orc/sargs/Literal.hh
@@ -27,21 +27,19 @@ namespace orc {
/**
* Possible data types for predicates
*/
- enum class PredicateDataType {
- LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN
- };
+ enum class PredicateDataType { LONG = 0, FLOAT, STRING, DATE, DECIMAL, TIMESTAMP, BOOLEAN };
/**
* Represents a literal value in a predicate
*/
class Literal {
- public:
+ public:
struct Timestamp {
Timestamp() = default;
Timestamp(const Timestamp&) = default;
Timestamp(Timestamp&&) = default;
~Timestamp() = default;
- Timestamp(int64_t second_, int32_t nanos_): second(second_), nanos(nanos_) {
+ Timestamp(int64_t second_, int32_t nanos_) : second(second_), nanos(nanos_) {
// PASS
}
Timestamp& operator=(const Timestamp&) = default;
@@ -55,15 +53,23 @@ namespace orc {
bool operator<=(const Timestamp& r) const {
return second < r.second || (second == r.second && nanos <= r.nanos);
}
- bool operator!=(const Timestamp& r) const { return !(*this == r); }
- bool operator>(const Timestamp& r) const { return r < *this; }
- bool operator>=(const Timestamp& r) const { return r <= *this; }
- int64_t getMillis() const { return second * 1000 + nanos / 1000000; }
+ bool operator!=(const Timestamp& r) const {
+ return !(*this == r);
+ }
+ bool operator>(const Timestamp& r) const {
+ return r < *this;
+ }
+ bool operator>=(const Timestamp& r) const {
+ return r <= *this;
+ }
+ int64_t getMillis() const {
+ return second * 1000 + nanos / 1000000;
+ }
int64_t second;
int32_t nanos;
};
- Literal(const Literal &r);
+ Literal(const Literal& r);
~Literal();
Literal& operator=(const Literal& r);
bool operator==(const Literal& r) const;
@@ -102,7 +108,7 @@ namespace orc {
/**
* Create a literal of STRING type
*/
- Literal(const char * str, size_t size);
+ Literal(const char* str, size_t size);
/**
* Create a literal of DECIMAL type
@@ -123,38 +129,44 @@ namespace orc {
/**
* Check if a literal is null
*/
- bool isNull() const { return mIsNull; }
+ bool isNull() const {
+ return mIsNull;
+ }
- PredicateDataType getType() const { return mType; }
+ PredicateDataType getType() const {
+ return mType;
+ }
std::string toString() const;
- size_t getHashCode() const { return mHashCode; }
+ size_t getHashCode() const {
+ return mHashCode;
+ }
- private:
+ private:
size_t hashCode() const;
union LiteralVal {
int64_t IntVal;
double DoubleVal;
int64_t DateVal;
- char * Buffer;
+ char* Buffer;
Timestamp TimeStampVal;
Int128 DecimalVal;
bool BooleanVal;
// explicitly define default constructor
- LiteralVal(): DecimalVal(0) {}
+ LiteralVal() : DecimalVal(0) {}
};
- private:
- LiteralVal mValue; // data value for this literal if not null
- PredicateDataType mType; // data type of the literal
- size_t mSize; // size of mValue if it is Buffer
- int32_t mPrecision; // precision of decimal type
- int32_t mScale; // scale of decimal type
- bool mIsNull; // whether this literal is null
- size_t mHashCode; // precomputed hash code for the literal
+ private:
+ LiteralVal mValue; // data value for this literal if not null
+ PredicateDataType mType; // data type of the literal
+ size_t mSize; // size of mValue if it is Buffer
+ int32_t mPrecision; // precision of decimal type
+ int32_t mScale; // scale of decimal type
+ bool mIsNull; // whether this literal is null
+ size_t mHashCode; // precomputed hash code for the literal
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_LITERAL_HH
+#endif // ORC_LITERAL_HH
diff --git a/c++/include/orc/sargs/SearchArgument.hh b/c++/include/orc/sargs/SearchArgument.hh
index 44fde8f5e..6493840a9 100644
--- a/c++/include/orc/sargs/SearchArgument.hh
+++ b/c++/include/orc/sargs/SearchArgument.hh
@@ -34,7 +34,7 @@ namespace orc {
* (<a href="http://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF</a>).
*/
class SearchArgument {
- public:
+ public:
virtual ~SearchArgument();
/**
@@ -52,7 +52,7 @@ namespace orc {
* must call startOr, startAnd, or startNot before adding any leaves.
*/
class SearchArgumentBuilder {
- public:
+ public:
virtual ~SearchArgumentBuilder();
/**
@@ -87,8 +87,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThan(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThan(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -98,8 +97,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThan(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThan(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -109,8 +107,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThanEquals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThanEquals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -120,8 +117,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& lessThanEquals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -131,8 +127,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& equals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& equals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -142,8 +137,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& equals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& equals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -153,8 +147,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& nullSafeEquals(const std::string& column, PredicateDataType type,
Literal literal) = 0;
/**
@@ -164,8 +157,7 @@ namespace orc {
* @param literal the literal
* @return this
*/
- virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& nullSafeEquals(uint64_t columnId, PredicateDataType type,
Literal literal) = 0;
/**
@@ -175,8 +167,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::initializer_list<Literal>& literals) = 0;
/**
@@ -186,8 +177,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::initializer_list<Literal>& literals) = 0;
/**
@@ -197,8 +187,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(const std::string& column,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(const std::string& column, PredicateDataType type,
const std::vector<Literal>& literals) = 0;
/**
@@ -208,8 +197,7 @@ namespace orc {
* @param literals the literals
* @return this
*/
- virtual SearchArgumentBuilder& in(uint64_t columnId,
- PredicateDataType type,
+ virtual SearchArgumentBuilder& in(uint64_t columnId, PredicateDataType type,
const std::vector<Literal>& literals) = 0;
/**
@@ -218,8 +206,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- virtual SearchArgumentBuilder& isNull(const std::string& column,
- PredicateDataType type) = 0;
+ virtual SearchArgumentBuilder& isNull(const std::string& column, PredicateDataType type) = 0;
/**
* Add an is null leaf to the current item on the stack.
@@ -227,8 +214,7 @@ namespace orc {
* @param type the type of the expression
* @return this
*/
- virtual SearchArgumentBuilder& isNull(uint64_t columnId,
- PredicateDataType type) = 0;
+ virtual SearchArgumentBuilder& isNull(uint64_t columnId, PredicateDataType type) = 0;
/**
* Add a between leaf to the current item on the stack.
@@ -238,10 +224,8 @@ namespace orc {
* @param upper the literal
* @return this
*/
- virtual SearchArgumentBuilder& between(const std::string& column,
- PredicateDataType type,
- Literal lower,
- Literal upper) = 0;
+ virtual SearchArgumentBuilder& between(const std::string& column, PredicateDataType type,
+ Literal lower, Literal upper) = 0;
/**
* Add a between leaf to the current item on the stack.
@@ -251,9 +235,7 @@ namespace orc {
* @param upper the literal
* @return this
*/
- virtual SearchArgumentBuilder& between(uint64_t columnId,
- PredicateDataType type,
- Literal lower,
+ virtual SearchArgumentBuilder& between(uint64_t columnId, PredicateDataType type, Literal lower,
Literal upper) = 0;
/**
@@ -275,10 +257,10 @@ namespace orc {
* Factory to create SearchArgumentBuilder which builds SearchArgument
*/
class SearchArgumentFactory {
- public:
+ public:
static std::unique_ptr<SearchArgumentBuilder> newBuilder();
};
-} // namespace orc
+} // namespace orc
-#endif //ORC_SEARCHARGUMENT_HH
+#endif // ORC_SEARCHARGUMENT_HH
diff --git a/c++/include/orc/sargs/TruthValue.hh b/c++/include/orc/sargs/TruthValue.hh
index b3ea6b76c..fa3dce06f 100644
--- a/c++/include/orc/sargs/TruthValue.hh
+++ b/c++/include/orc/sargs/TruthValue.hh
@@ -25,13 +25,13 @@ namespace orc {
* The potential result sets of logical operations.
*/
enum class TruthValue {
- YES, // all rows satisfy the predicate
- NO, // all rows dissatisfy the predicate
- IS_NULL, // all rows are null value
- YES_NULL, // null values exist, not-null rows satisfy the predicate
- NO_NULL, // null values exist, not-null rows dissatisfy the predicate
- YES_NO, // some rows satisfy the predicate and the others not
- YES_NO_NULL // null values exist, some rows satisfy predicate and some not
+ YES, // all rows satisfy the predicate
+ NO, // all rows dissatisfy the predicate
+ IS_NULL, // all rows are null value
+ YES_NULL, // null values exist, not-null rows satisfy the predicate
+ NO_NULL, // null values exist, not-null rows dissatisfy the predicate
+ YES_NO, // some rows satisfy the predicate and the others not
+ YES_NO_NULL // null values exist, some rows satisfy predicate and some not
};
// Compute logical or between the two values.
@@ -46,6 +46,6 @@ namespace orc {
// Do we need to read the data based on the TruthValue?
bool isNeeded(TruthValue val);
-} // namespace orc
+} // namespace orc
-#endif //ORC_TRUTHVALUE_HH
+#endif // ORC_TRUTHVALUE_HH
diff --git a/c++/src/Adaptor.cc b/c++/src/Adaptor.cc
index bf3a3e181..b3d49511d 100644
--- a/c++/src/Adaptor.cc
+++ b/c++/src/Adaptor.cc
@@ -1,24 +1,24 @@
/**
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
#include "Adaptor.hh"
-#include <sstream>
#include <iomanip>
+#include <sstream>
#ifndef HAS_STOLL
namespace std {
@@ -29,7 +29,7 @@ namespace std {
ss >> val;
return val;
}
-}
+} // namespace std
#endif
#ifndef HAS_STRPTIME
@@ -43,7 +43,7 @@ char* strptime(const char* s, const char* f, struct tm* tm) {
#endif
#ifndef HAS_PREAD
- #ifdef _WIN32
+#ifdef _WIN32
#include <Windows.h>
#include <io.h>
ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
@@ -60,9 +60,9 @@ ssize_t pread(int fd, void* buf, size_t size, off_t offset) {
}
return static_cast<ssize_t>(rt);
}
- #else
- #error("pread() undefined: unknown environment")
- #endif
+#else
+#error("pread() undefined: unknown environment")
+#endif
#endif
namespace orc {
@@ -85,4 +85,4 @@ namespace orc {
return std::to_string(static_cast<long long int>(val));
}
#endif
-}
+} // namespace orc
diff --git a/c++/src/BlockBuffer.cc b/c++/src/BlockBuffer.cc
index defd86ae9..1f7843fad 100644
--- a/c++/src/BlockBuffer.cc
+++ b/c++/src/BlockBuffer.cc
@@ -17,18 +17,15 @@
*/
#include "BlockBuffer.hh"
-#include "orc/Writer.hh"
#include "orc/OrcFile.hh"
+#include "orc/Writer.hh"
#include <algorithm>
namespace orc {
BlockBuffer::BlockBuffer(MemoryPool& pool, uint64_t _blockSize)
- : memoryPool(pool),
- currentSize(0),
- currentCapacity(0),
- blockSize(_blockSize) {
+ : memoryPool(pool), currentSize(0), currentCapacity(0), blockSize(_blockSize) {
if (blockSize == 0) {
throw std::logic_error("Block size cannot be zero");
}
@@ -47,15 +44,13 @@ namespace orc {
if (blockIndex >= getBlockNumber()) {
throw std::out_of_range("Block index out of range");
}
- return Block(blocks[blockIndex],
- std::min(currentSize - blockIndex * blockSize, blockSize));
+ return Block(blocks[blockIndex], std::min(currentSize - blockIndex * blockSize, blockSize));
}
BlockBuffer::Block BlockBuffer::getNextBlock() {
if (currentSize < currentCapacity) {
- Block emptyBlock(
- blocks[currentSize / blockSize] + currentSize % blockSize,
- blockSize - currentSize % blockSize);
+ Block emptyBlock(blocks[currentSize / blockSize] + currentSize % blockSize,
+ blockSize - currentSize % blockSize);
currentSize = (currentSize / blockSize + 1) * blockSize;
return emptyBlock;
} else {
@@ -85,8 +80,7 @@ namespace orc {
}
}
- void BlockBuffer::writeTo(OutputStream* output,
- WriterMetrics* metrics) {
+ void BlockBuffer::writeTo(OutputStream* output, WriterMetrics* metrics) {
if (currentSize == 0) {
return;
}
@@ -110,8 +104,7 @@ namespace orc {
uint64_t blockOffset = 0;
while (blockOffset < block.size) {
// copy current block into chunk
- uint64_t copySize =
- std::min(chunkSize - chunkOffset, block.size - blockOffset);
+ uint64_t copySize = std::min(chunkSize - chunkOffset, block.size - blockOffset);
memcpy(chunk + chunkOffset, block.data + blockOffset, copySize);
chunkOffset += copySize;
blockOffset += copySize;
@@ -135,4 +128,4 @@ namespace orc {
metrics->IOCount.fetch_add(ioCount);
}
}
-} // namespace orc
+} // namespace orc
diff --git a/c++/src/BlockBuffer.hh b/c++/src/BlockBuffer.hh
index 2869cce9b..0f5f78e3f 100644
--- a/c++/src/BlockBuffer.hh
+++ b/c++/src/BlockBuffer.hh
@@ -33,7 +33,7 @@ namespace orc {
* for allocation.
*/
class BlockBuffer {
- private:
+ private:
MemoryPool& memoryPool;
// current buffer size
uint64_t currentSize;
@@ -50,15 +50,15 @@ namespace orc {
BlockBuffer(BlockBuffer&& buffer) = delete;
BlockBuffer& operator=(BlockBuffer&& buffer) = delete;
- public:
+ public:
BlockBuffer(MemoryPool& pool, uint64_t blockSize);
~BlockBuffer();
/**
- * Block points to a section of memory allocated by BlockBuffer,
- * containing the corresponding physical memory address and available size.
- */
+ * Block points to a section of memory allocated by BlockBuffer,
+ * containing the corresponding physical memory address and available size.
+ */
struct Block {
// the start of block
char* data;
@@ -117,8 +117,7 @@ namespace orc {
* @param output the output stream to write to
* @param metrics the metrics of the writer
*/
- void writeTo(OutputStream* output,
- WriterMetrics* metrics);
+ void writeTo(OutputStream* output, WriterMetrics* metrics);
};
} // namespace orc
diff --git a/c++/src/BloomFilter.cc b/c++/src/BloomFilter.cc
index 8a1f1880e..85a38f9a2 100644
--- a/c++/src/BloomFilter.cc
+++ b/c++/src/BloomFilter.cc
@@ -22,11 +22,14 @@
namespace orc {
constexpr uint64_t BITS_OF_LONG = 64;
- constexpr uint8_t SHIFT_6_BITS = 6;
- constexpr uint8_t SHIFT_3_BITS = 3;
+ constexpr uint8_t SHIFT_6_BITS = 6;
+ constexpr uint8_t SHIFT_3_BITS = 3;
static bool isLittleEndian() {
- static union { uint32_t i; char c[4]; } num = { 0x01020304 };
+ static union {
+ uint32_t i;
+ char c[4];
+ } num = {0x01020304};
return num.c[0] == 4;
}
@@ -34,11 +37,10 @@ namespace orc {
* Implementation of BitSet
*/
BitSet::BitSet(uint64_t numBits) {
- mData.resize(static_cast<size_t>(ceil(
- static_cast<double>(numBits) / BITS_OF_LONG)), 0);
+ mData.resize(static_cast<size_t>(ceil(static_cast<double>(numBits) / BITS_OF_LONG)), 0);
}
- BitSet::BitSet(const uint64_t * bits, uint64_t numBits) {
+ BitSet::BitSet(const uint64_t* bits, uint64_t numBits) {
// caller should make sure numBits is multiple of 64
mData.resize(numBits >> SHIFT_6_BITS, 0);
memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS);
@@ -59,8 +61,8 @@ namespace orc {
void BitSet::merge(const BitSet& other) {
if (mData.size() != other.mData.size()) {
std::stringstream ss;
- ss << "BitSet must be of equal length ("
- << mData.size() << " != " << other.mData.size() << ")";
+ ss << "BitSet must be of equal length (" << mData.size() << " != " << other.mData.size()
+ << ")";
throw std::logic_error(ss.str());
}
@@ -73,7 +75,7 @@ namespace orc {
memset(mData.data(), 0, sizeof(uint64_t) * mData.size());
}
- const uint64_t * BitSet::getData() const {
+ const uint64_t* BitSet::getData() const {
return mData.data();
}
@@ -92,8 +94,8 @@ namespace orc {
int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) {
double n = static_cast<double>(expectedEntries);
- return std::max<int32_t>(1, static_cast<int32_t>(
- std::round(static_cast<double>(numBits) / n * std::log(2.0))));
+ return std::max<int32_t>(
+ 1, static_cast<int32_t>(std::round(static_cast<double>(numBits) / n * std::log(2.0))));
}
int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) {
@@ -108,23 +110,20 @@ namespace orc {
// probability'
// Lets split up 64-bit hashcode into two 32-bit hash codes and employ
// the technique mentioned in the above paper
- inline uint64_t getBytesHash(const char * data, int64_t length) {
+ inline uint64_t getBytesHash(const char* data, int64_t length) {
if (data == nullptr) {
return Murmur3::NULL_HASHCODE;
}
- return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data),
- static_cast<uint32_t>(length));
+ return Murmur3::hash64(reinterpret_cast<const uint8_t*>(data), static_cast<uint32_t>(length));
}
/**
* Implementation of BloomFilter
*/
BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) {
- checkArgument(expectedEntries > 0,
- "expectedEntries should be > 0");
- checkArgument(fpp > 0.0 && fpp < 1.0,
- "False positive probability should be > 0.0 & < 1.0");
+ checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
+ checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp));
// make 'mNumBits' multiple of 64
@@ -133,7 +132,7 @@ namespace orc {
mBitSet.reset(new BitSet(mNumBits));
}
- void BloomFilterImpl::addBytes(const char * data, int64_t length) {
+ void BloomFilterImpl::addBytes(const char* data, int64_t length) {
uint64_t hash64 = getBytesHash(data, length);
addHash(static_cast<int64_t>(hash64));
}
@@ -142,7 +141,7 @@ namespace orc {
addHash(getLongHash(data));
}
- bool BloomFilterImpl::testBytes(const char * data, int64_t length) const {
+ bool BloomFilterImpl::testBytes(const char* data, int64_t length) const {
uint64_t hash64 = getBytesHash(data, length);
return testHash(static_cast<int64_t>(hash64));
}
@@ -182,7 +181,7 @@ namespace orc {
mNumBits = bitsetStr.size() << SHIFT_3_BITS;
checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!");
- const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data());
+ const uint64_t* bitset = reinterpret_cast<const uint64_t*>(bitsetStr.data());
if (isLittleEndian()) {
mBitSet.reset(new BitSet(bitset, mNumBits));
} else {
@@ -204,7 +203,7 @@ namespace orc {
addLong(reinterpret_cast<int64_t&>(data));
}
- bool BloomFilterImpl::testDouble(double data) const{
+ bool BloomFilterImpl::testDouble(double data) const {
return testLong(reinterpret_cast<int64_t&>(data));
}
@@ -227,7 +226,7 @@ namespace orc {
}
}
- bool BloomFilterImpl::testHash(int64_t hash64) const{
+ bool BloomFilterImpl::testHash(int64_t hash64) const {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
// So we cast hash64 to uint64_t here for an unsigned right shift.
@@ -251,10 +250,8 @@ namespace orc {
if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) {
std::stringstream ss;
ss << "BloomFilters are not compatible for merging: "
- << "this: numBits:" << mNumBits
- << ",numHashFunctions:" << mNumHashFunctions
- << ", that: numBits:" << other.mNumBits
- << ",numHashFunctions:" << other.mNumHashFunctions;
+ << "this: numBits:" << mNumBits << ",numHashFunctions:" << mNumHashFunctions
+ << ", that: numBits:" << other.mNumBits << ",numHashFunctions:" << other.mNumHashFunctions;
throw std::logic_error(ss.str());
}
@@ -272,11 +269,11 @@ namespace orc {
// a little endian encoding in the utf8bitset field.
if (isLittleEndian()) {
// bytes are already organized in little endian; thus no conversion needed
- const char * bitset = reinterpret_cast<const char *>(mBitSet->getData());
+ const char* bitset = reinterpret_cast<const char*>(mBitSet->getData());
bloomFilter.set_utf8bitset(bitset, sizeInBytes());
} else {
std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0);
- const uint64_t * longs = mBitSet->getData();
+ const uint64_t* longs = mBitSet->getData();
for (size_t i = 0; i != bitset.size(); ++i) {
uint64_t& dst = bitset[i];
const uint64_t src = longs[i];
@@ -290,8 +287,7 @@ namespace orc {
}
bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const {
- return mNumBits == other.mNumBits &&
- mNumHashFunctions == other.mNumHashFunctions &&
+ return mNumBits == other.mNumBits && mNumHashFunctions == other.mNumHashFunctions &&
*mBitSet == *other.mBitSet;
}
@@ -300,10 +296,8 @@ namespace orc {
}
std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize(
- const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& encoding,
- const proto::BloomFilter& bloomFilter) {
-
+ const proto::Stream_Kind& streamKind, const proto::ColumnEncoding& encoding,
+ const proto::BloomFilter& bloomFilter) {
std::unique_ptr<BloomFilter> ret(nullptr);
// only BLOOM_FILTER_UTF8 is supported
@@ -325,4 +319,4 @@ namespace orc {
return ret;
}
-}
+} // namespace orc
diff --git a/c++/src/BloomFilter.hh b/c++/src/BloomFilter.hh
index cf18a46fd..d72961a83 100644
--- a/c++/src/BloomFilter.hh
+++ b/c++/src/BloomFilter.hh
@@ -33,7 +33,7 @@ namespace orc {
* for index bounds nor expand the bit set size if the specified index is greater than the size.
*/
class BitSet {
- public:
+ public:
/**
* Creates an empty BitSet
*
@@ -47,7 +47,7 @@ namespace orc {
* @param bits - serialized uint64_t buffer of bitset
* @param numBits - number of bits used
*/
- BitSet(const uint64_t * bits, uint64_t numBits);
+ BitSet(const uint64_t* bits, uint64_t numBits);
/**
* Sets the bit at specified index.
@@ -82,14 +82,14 @@ namespace orc {
/**
* Gets underlying raw data
*/
- const uint64_t * getData() const;
+ const uint64_t* getData() const;
/**
* Compares two BitSets
*/
bool operator==(const BitSet& other) const;
- private:
+ private:
std::vector<uint64_t> mData;
};
@@ -120,14 +120,14 @@ namespace orc {
* BloomFilterUtf8, which always uses UTF8 for the encoding.
*/
class BloomFilterImpl : public BloomFilter {
- public:
+ public:
/**
* Creates an empty BloomFilter
*
* @param expectedEntries - number of entries it will hold
* @param fpp - false positive probability
*/
- BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP);
+ BloomFilterImpl(uint64_t expectedEntries, double fpp = DEFAULT_FPP);
/**
* Creates a BloomFilter by deserializing the proto-buf version
@@ -139,14 +139,14 @@ namespace orc {
/**
* Adds a new element to the BloomFilter
*/
- void addBytes(const char * data, int64_t length);
+ void addBytes(const char* data, int64_t length);
void addLong(int64_t data);
void addDouble(double data);
/**
* Test if the element exists in BloomFilter
*/
- bool testBytes(const char * data, int64_t length) const override;
+ bool testBytes(const char* data, int64_t length) const override;
bool testLong(int64_t data) const override;
bool testDouble(double data) const override;
@@ -160,7 +160,7 @@ namespace orc {
bool operator==(const BloomFilterImpl& other) const;
- private:
+ private:
friend struct BloomFilterUTF8Utils;
friend class TestBloomFilter_testBloomFilterBasicOperations_Test;
@@ -172,7 +172,7 @@ namespace orc {
void serialize(proto::BloomFilter& bloomFilter) const;
- private:
+ private:
static constexpr double DEFAULT_FPP = 0.05;
uint64_t mNumBits;
int32_t mNumHashFunctions;
@@ -186,25 +186,24 @@ namespace orc {
}
// deserialize BloomFilter from protobuf
- static std::unique_ptr<BloomFilter>
- deserialize(const proto::Stream_Kind& streamKind,
- const proto::ColumnEncoding& columnEncoding,
- const proto::BloomFilter& bloomFilter);
+ static std::unique_ptr<BloomFilter> deserialize(const proto::Stream_Kind& streamKind,
+ const proto::ColumnEncoding& columnEncoding,
+ const proto::BloomFilter& bloomFilter);
};
// Thomas Wang's integer hash function
// http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
inline int64_t getLongHash(int64_t key) {
- key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
- key = (key + (key << 3)) + (key << 8); // key * 265
+ key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
- key = (key + (key << 2)) + (key << 4); // key * 21
+ key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
return key;
}
-}
+} // namespace orc
-#endif //ORC_BLOOMFILTER_IMPL_HH
+#endif // ORC_BLOOMFILTER_IMPL_HH
diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc
index 261d07759..e268a5645 100644
--- a/c++/src/ByteRLE.cc
+++ b/c++/src/ByteRLE.cc
@@ -16,9 +16,9 @@
* limitations under the License.
*/
+#include <string.h>
#include <algorithm>
#include <iostream>
-#include <string.h>
#include <utility>
#include "ByteRLE.hh"
@@ -36,7 +36,7 @@ namespace orc {
}
class ByteRleEncoderImpl : public ByteRleEncoder {
- public:
+ public:
ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
virtual ~ByteRleEncoderImpl() override;
@@ -47,8 +47,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) override;
/**
* Get size of buffer used so far.
@@ -69,7 +68,7 @@ namespace orc {
*/
void reset();
- protected:
+ protected:
std::unique_ptr<BufferedOutputStream> outputStream;
char* literals;
int numLiterals;
@@ -84,22 +83,21 @@ namespace orc {
void write(char c);
};
- ByteRleEncoderImpl::ByteRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : outputStream(std::move(output)) {
+ ByteRleEncoderImpl::ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output)
+ : outputStream(std::move(output)) {
literals = new char[MAX_LITERAL_SIZE];
reset();
}
ByteRleEncoderImpl::~ByteRleEncoderImpl() {
// PASS
- delete [] literals;
+ delete[] literals;
}
void ByteRleEncoderImpl::writeByte(char c) {
if (bufferPosition == bufferLength) {
int addedSize = 0;
- if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) {
+ if (!outputStream->Next(reinterpret_cast<void**>(&buffer), &addedSize)) {
throw std::bad_alloc();
}
bufferPosition = 0;
@@ -108,10 +106,7 @@ namespace orc {
buffer[bufferPosition++] = c;
}
- void ByteRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
+ void ByteRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
write(data[i]);
@@ -122,8 +117,7 @@ namespace orc {
void ByteRleEncoderImpl::writeValues() {
if (numLiterals != 0) {
if (repeat) {
- writeByte(
- static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
+ writeByte(static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT)));
writeByte(literals[0]);
} else {
writeByte(static_cast<char>(-numLiterals));
@@ -190,7 +184,7 @@ namespace orc {
return outputStream->getSize();
}
- void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const {
+ void ByteRleEncoderImpl::recordPosition(PositionRecorder* recorder) const {
uint64_t flushedSize = outputStream->getSize();
uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition);
if (outputStream->isCompressed()) {
@@ -221,14 +215,13 @@ namespace orc {
reset();
}
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl
- (std::move(output)));
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output) {
+ return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl(std::move(output)));
}
class BooleanRleEncoderImpl : public ByteRleEncoderImpl {
- public:
+ public:
BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output);
virtual ~BooleanRleEncoderImpl() override;
@@ -239,8 +232,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) override;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) override;
/**
* Flushing underlying BufferedOutputStream
@@ -249,15 +241,13 @@ namespace orc {
virtual void recordPosition(PositionRecorder* recorder) const override;
- private:
+ private:
int bitsRemained;
char current;
-
};
- BooleanRleEncoderImpl::BooleanRleEncoderImpl(
- std::unique_ptr<BufferedOutputStream> output)
- : ByteRleEncoderImpl(std::move(output)) {
+ BooleanRleEncoderImpl::BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output)
+ : ByteRleEncoderImpl(std::move(output)) {
bitsRemained = 8;
current = static_cast<char>(0);
}
@@ -266,10 +256,7 @@ namespace orc {
// PASS
}
- void BooleanRleEncoderImpl::add(
- const char* data,
- uint64_t numValues,
- const char* notNull) {
+ void BooleanRleEncoderImpl::add(const char* data, uint64_t numValues, const char* notNull) {
for (uint64_t i = 0; i < numValues; ++i) {
if (bitsRemained == 0) {
write(current);
@@ -278,8 +265,7 @@ namespace orc {
}
if (!notNull || notNull[i]) {
if (!data || data[i]) {
- current =
- static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
+ current = static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
}
--bitsRemained;
}
@@ -305,22 +291,19 @@ namespace orc {
recorder->add(static_cast<uint64_t>(8 - bitsRemained));
}
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output) {
- BooleanRleEncoderImpl* encoder =
- new BooleanRleEncoderImpl(std::move(output)) ;
- return std::unique_ptr<ByteRleEncoder>(
- reinterpret_cast<ByteRleEncoder*>(encoder));
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output) {
+ BooleanRleEncoderImpl* encoder = new BooleanRleEncoderImpl(std::move(output));
+ return std::unique_ptr<ByteRleEncoder>(reinterpret_cast<ByteRleEncoder*>(encoder));
}
ByteRleDecoder::~ByteRleDecoder() {
// PASS
}
- class ByteRleDecoderImpl: public ByteRleDecoder {
- public:
- ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics);
+ class ByteRleDecoderImpl : public ByteRleDecoder {
+ public:
+ ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
virtual ~ByteRleDecoderImpl();
@@ -339,7 +322,7 @@ namespace orc {
*/
virtual void next(char* data, uint64_t numValues, char* notNull);
- protected:
+ protected:
void nextInternal(char* data, uint64_t numValues, char* notNull);
inline void nextBuffer();
inline signed char readByte();
@@ -394,10 +377,9 @@ namespace orc {
bufferEnd = nullptr;
}
- ByteRleDecoderImpl::ByteRleDecoderImpl(
- std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* _metrics)
- : metrics(_metrics) {
+ ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* _metrics)
+ : metrics(_metrics) {
inputStream = std::move(input);
reset();
}
@@ -433,8 +415,7 @@ namespace orc {
nextBuffer();
}
size_t skipSize = std::min(static_cast<size_t>(consumedBytes),
- static_cast<size_t>(bufferEnd -
- bufferStart));
+ static_cast<size_t>(bufferEnd - bufferStart));
bufferStart += skipSize;
consumedBytes -= skipSize;
}
@@ -442,14 +423,12 @@ namespace orc {
}
}
- void ByteRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
+ void ByteRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) {
SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall);
nextInternal(data, numValues, notNull);
}
- void ByteRleDecoderImpl::nextInternal(char* data, uint64_t numValues,
- char* notNull) {
+ void ByteRleDecoderImpl::nextInternal(char* data, uint64_t numValues, char* notNull) {
uint64_t position = 0;
// skip over null values
while (notNull && position < numValues && !notNull[position]) {
@@ -461,12 +440,11 @@ namespace orc {
readHeader();
}
// how many do we read out of this block?
- size_t count = std::min(static_cast<size_t>(numValues - position),
- remainingValues);
+ size_t count = std::min(static_cast<size_t>(numValues - position), remainingValues);
uint64_t consumed = 0;
if (repeating) {
if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
+ for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
data[position + i] = value;
consumed += 1;
@@ -478,7 +456,7 @@ namespace orc {
}
} else {
if (notNull) {
- for(uint64_t i=0; i < count; ++i) {
+ for (uint64_t i = 0; i < count; ++i) {
if (notNull[position + i]) {
data[position + i] = readByte();
consumed += 1;
@@ -490,9 +468,8 @@ namespace orc {
if (bufferStart == bufferEnd) {
nextBuffer();
}
- uint64_t copyBytes =
- std::min(static_cast<uint64_t>(count - i),
- static_cast<uint64_t>(bufferEnd - bufferStart));
+ uint64_t copyBytes = std::min(static_cast<uint64_t>(count - i),
+ static_cast<uint64_t>(bufferEnd - bufferStart));
memcpy(data + position + i, bufferStart, copyBytes);
bufferStart += copyBytes;
i += copyBytes;
@@ -509,17 +486,14 @@ namespace orc {
}
}
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics) {
- return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl
- (std::move(input), metrics));
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* metrics) {
+ return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl(std::move(input), metrics));
}
- class BooleanRleDecoderImpl: public ByteRleDecoderImpl {
- public:
- BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics);
+ class BooleanRleDecoderImpl : public ByteRleDecoderImpl {
+ public:
+ BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
virtual ~BooleanRleDecoderImpl();
@@ -538,15 +512,14 @@ namespace orc {
*/
virtual void next(char* data, uint64_t numValues, char* notNull);
- protected:
+ protected:
size_t remainingBits;
char lastByte;
};
- BooleanRleDecoderImpl::BooleanRleDecoderImpl
- (std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* _metrics
- ): ByteRleDecoderImpl(std::move(input), _metrics) {
+ BooleanRleDecoderImpl::BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* _metrics)
+ : ByteRleDecoderImpl(std::move(input), _metrics) {
remainingBits = 0;
lastByte = 0;
}
@@ -584,36 +557,33 @@ namespace orc {
}
}
- void BooleanRleDecoderImpl::next(char* data, uint64_t numValues,
- char* notNull) {
+ void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, char* notNull) {
SCOPED_STOPWATCH(metrics, ByteDecodingLatencyUs, ByteDecodingCall);
// next spot to fill in
uint64_t position = 0;
// use up any remaining bits
if (notNull) {
- while(remainingBits > 0 && position < numValues) {
+ while (remainingBits > 0 && position < numValues) {
if (notNull[position]) {
remainingBits -= 1;
- data[position] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
+ data[position] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
} else {
data[position] = 0;
}
position += 1;
}
} else {
- while(remainingBits > 0 && position < numValues) {
+ while (remainingBits > 0 && position < numValues) {
remainingBits -= 1;
- data[position++] = (static_cast<unsigned char>(lastByte) >>
- remainingBits) & 0x1;
+ data[position++] = (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
}
}
// count the number of nonNulls remaining
uint64_t nonNulls = numValues - position;
if (notNull) {
- for(uint64_t i=position; i < numValues; ++i) {
+ for (uint64_t i = position; i < numValues; ++i) {
if (!notNull[i]) {
nonNulls -= 1;
}
@@ -634,8 +604,8 @@ namespace orc {
// expand the array backwards so that we don't clobber the data
uint64_t bitsLeft = bytesRead * 8 - remainingBits;
if (notNull) {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i) {
+ for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position);
+ --i) {
if (notNull[i]) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
@@ -645,8 +615,8 @@ namespace orc {
}
}
} else {
- for(int64_t i=static_cast<int64_t>(numValues) - 1;
- i >= static_cast<int64_t>(position); --i, --bitsLeft) {
+ for (int64_t i = static_cast<int64_t>(numValues) - 1; i >= static_cast<int64_t>(position);
+ --i, --bitsLeft) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
}
@@ -654,12 +624,9 @@ namespace orc {
}
}
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics) {
- BooleanRleDecoderImpl* decoder =
- new BooleanRleDecoderImpl(std::move(input), metrics);
- return std::unique_ptr<ByteRleDecoder>(
- reinterpret_cast<ByteRleDecoder*>(decoder));
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics) {
+ BooleanRleDecoderImpl* decoder = new BooleanRleDecoderImpl(std::move(input), metrics);
+ return std::unique_ptr<ByteRleDecoder>(reinterpret_cast<ByteRleDecoder*>(decoder));
}
-}
+} // namespace orc
diff --git a/c++/src/ByteRLE.hh b/c++/src/ByteRLE.hh
index f8aecc6c6..bd19f52ec 100644
--- a/c++/src/ByteRLE.hh
+++ b/c++/src/ByteRLE.hh
@@ -27,7 +27,7 @@
namespace orc {
class ByteRleEncoder {
- public:
+ public:
virtual ~ByteRleEncoder();
/**
@@ -37,8 +37,7 @@ namespace orc {
* @param notNull If the pointer is null, all values are read. If the
* pointer is not null, positions that are false are skipped.
*/
- virtual void add(const char* data, uint64_t numValues,
- const char* notNull) = 0;
+ virtual void add(const char* data, uint64_t numValues, const char* notNull) = 0;
/**
* Get size of buffer used so far.
@@ -63,7 +62,7 @@ namespace orc {
};
class ByteRleDecoder {
- public:
+ public:
virtual ~ByteRleDecoder();
/**
@@ -90,24 +89,23 @@ namespace orc {
* Create a byte RLE encoder.
* @param output the output stream to write to
*/
- std::unique_ptr<ByteRleEncoder> createByteRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
+ std::unique_ptr<ByteRleEncoder> createByteRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output);
/**
* Create a boolean RLE encoder.
* @param output the output stream to write to
*/
- std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder
- (std::unique_ptr<BufferedOutputStream> output);
+ std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder(
+ std::unique_ptr<BufferedOutputStream> output);
/**
* Create a byte RLE decoder.
* @param input the input stream to read from
* @param metrics the metrics of the decoder
*/
- std::unique_ptr<ByteRleDecoder> createByteRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics);
+ std::unique_ptr<ByteRleDecoder> createByteRleDecoder(std::unique_ptr<SeekableInputStream> input,
+ ReaderMetrics* metrics);
/**
* Create a boolean RLE decoder.
@@ -118,9 +116,8 @@ namespace orc {
* @param input the input stream to read from
* @param metrics the metrics of the decoder
*/
- std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder
- (std::unique_ptr<SeekableInputStream> input,
- ReaderMetrics* metrics);
-}
+ std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, ReaderMetrics* metrics);
+} // namespace orc
#endif
diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc
index ab6b690c5..3d40002b7 100644
--- a/c++/src/ColumnPrinter.cc
+++ b/c++/src/ColumnPrinter.cc
@@ -21,167 +21,174 @@
#include "Adaptor.hh"
+#include <time.h>
#include <limits>
#include <sstream>
#include <stdexcept>
-#include <time.h>
#include <typeinfo>
#ifdef __clang__
- #pragma clang diagnostic ignored "-Wformat-security"
+#pragma clang diagnostic ignored "-Wformat-security"
#endif
namespace orc {
- class VoidColumnPrinter: public ColumnPrinter {
- public:
+ class VoidColumnPrinter : public ColumnPrinter {
+ public:
VoidColumnPrinter(std::string&);
~VoidColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class BooleanColumnPrinter: public ColumnPrinter {
- private:
+ class BooleanColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+
+ public:
BooleanColumnPrinter(std::string&);
~BooleanColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class LongColumnPrinter: public ColumnPrinter {
- private:
+ class LongColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+
+ public:
LongColumnPrinter(std::string&);
~LongColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class DoubleColumnPrinter: public ColumnPrinter {
- private:
+ class DoubleColumnPrinter : public ColumnPrinter {
+ private:
const double* data;
const bool isFloat;
- public:
+ public:
DoubleColumnPrinter(std::string&, const Type& type);
virtual ~DoubleColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class TimestampColumnPrinter: public ColumnPrinter {
- private:
+ class TimestampColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* seconds;
const int64_t* nanoseconds;
- public:
+ public:
TimestampColumnPrinter(std::string&);
~TimestampColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class DateColumnPrinter: public ColumnPrinter {
- private:
+ class DateColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
- public:
+ public:
DateColumnPrinter(std::string&);
~DateColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class Decimal64ColumnPrinter: public ColumnPrinter {
- private:
+ class Decimal64ColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* data;
int32_t scale;
- public:
+
+ public:
Decimal64ColumnPrinter(std::string&);
~Decimal64ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class Decimal128ColumnPrinter: public ColumnPrinter {
- private:
+ class Decimal128ColumnPrinter : public ColumnPrinter {
+ private:
const Int128* data;
int32_t scale;
- public:
+
+ public:
Decimal128ColumnPrinter(std::string&);
~Decimal128ColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class StringColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
+ class StringColumnPrinter : public ColumnPrinter {
+ private:
+ const char* const* start;
const int64_t* length;
- public:
+
+ public:
StringColumnPrinter(std::string&);
virtual ~StringColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class BinaryColumnPrinter: public ColumnPrinter {
- private:
- const char* const * start;
+ class BinaryColumnPrinter : public ColumnPrinter {
+ private:
+ const char* const* start;
const int64_t* length;
- public:
+
+ public:
BinaryColumnPrinter(std::string&);
virtual ~BinaryColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class ListColumnPrinter: public ColumnPrinter {
- private:
+ class ListColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* offsets;
std::unique_ptr<ColumnPrinter> elementPrinter;
- public:
+ public:
ListColumnPrinter(std::string&, const Type& type);
virtual ~ListColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class MapColumnPrinter: public ColumnPrinter {
- private:
+ class MapColumnPrinter : public ColumnPrinter {
+ private:
const int64_t* offsets;
std::unique_ptr<ColumnPrinter> keyPrinter;
std::unique_ptr<ColumnPrinter> elementPrinter;
- public:
+ public:
MapColumnPrinter(std::string&, const Type& type);
virtual ~MapColumnPrinter() override {}
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class UnionColumnPrinter: public ColumnPrinter {
- private:
- const unsigned char *tags;
+ class UnionColumnPrinter : public ColumnPrinter {
+ private:
+ const unsigned char* tags;
const uint64_t* offsets;
std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
- public:
+ public:
UnionColumnPrinter(std::string&, const Type& type);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
};
- class StructColumnPrinter: public ColumnPrinter {
- private:
+ class StructColumnPrinter : public ColumnPrinter {
+ private:
std::vector<std::unique_ptr<ColumnPrinter>> fieldPrinter;
std::vector<std::string> fieldNames;
- public:
+
+ public:
StructColumnPrinter(std::string&, const Type& type);
void printRow(uint64_t rowId) override;
void reset(const ColumnVectorBatch& batch) override;
@@ -191,13 +198,12 @@ namespace orc {
file += ch;
}
- void writeString(std::string& file, const char *ptr) {
+ void writeString(std::string& file, const char* ptr) {
size_t len = strlen(ptr);
file.append(ptr, len);
}
- ColumnPrinter::ColumnPrinter(std::string& _buffer
- ): buffer(_buffer) {
+ ColumnPrinter::ColumnPrinter(std::string& _buffer) : buffer(_buffer) {
notNull = nullptr;
hasNulls = false;
}
@@ -211,89 +217,87 @@ namespace orc {
if (hasNulls) {
notNull = batch.notNull.data();
} else {
- notNull = nullptr ;
+ notNull = nullptr;
}
}
- std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer,
- const Type* type) {
- ColumnPrinter *result = nullptr;
+ std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, const Type* type) {
+ ColumnPrinter* result = nullptr;
if (type == nullptr) {
result = new VoidColumnPrinter(buffer);
} else {
- switch(static_cast<int64_t>(type->getKind())) {
- case BOOLEAN:
- result = new BooleanColumnPrinter(buffer);
- break;
-
- case BYTE:
- case SHORT:
- case INT:
- case LONG:
- result = new LongColumnPrinter(buffer);
- break;
-
- case FLOAT:
- case DOUBLE:
- result = new DoubleColumnPrinter(buffer, *type);
- break;
-
- case STRING:
- case VARCHAR :
- case CHAR:
- result = new StringColumnPrinter(buffer);
- break;
-
- case BINARY:
- result = new BinaryColumnPrinter(buffer);
- break;
-
- case TIMESTAMP:
- case TIMESTAMP_INSTANT:
- result = new TimestampColumnPrinter(buffer);
- break;
-
- case LIST:
- result = new ListColumnPrinter(buffer, *type);
- break;
-
- case MAP:
- result = new MapColumnPrinter(buffer, *type);
- break;
-
- case STRUCT:
- result = new StructColumnPrinter(buffer, *type);
- break;
-
- case DECIMAL:
- if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- result = new Decimal128ColumnPrinter(buffer);
- } else {
- result = new Decimal64ColumnPrinter(buffer);
- }
- break;
+ switch (static_cast<int64_t>(type->getKind())) {
+ case BOOLEAN:
+ result = new BooleanColumnPrinter(buffer);
+ break;
- case DATE:
- result = new DateColumnPrinter(buffer);
- break;
+ case BYTE:
+ case SHORT:
+ case INT:
+ case LONG:
+ result = new LongColumnPrinter(buffer);
+ break;
+
+ case FLOAT:
+ case DOUBLE:
+ result = new DoubleColumnPrinter(buffer, *type);
+ break;
- case UNION:
- result = new UnionColumnPrinter(buffer, *type);
- break;
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ result = new StringColumnPrinter(buffer);
+ break;
+
+ case BINARY:
+ result = new BinaryColumnPrinter(buffer);
+ break;
+
+ case TIMESTAMP:
+ case TIMESTAMP_INSTANT:
+ result = new TimestampColumnPrinter(buffer);
+ break;
+
+ case LIST:
+ result = new ListColumnPrinter(buffer, *type);
+ break;
+
+ case MAP:
+ result = new MapColumnPrinter(buffer, *type);
+ break;
+
+ case STRUCT:
+ result = new StructColumnPrinter(buffer, *type);
+ break;
+
+ case DECIMAL:
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ result = new Decimal128ColumnPrinter(buffer);
+ } else {
+ result = new Decimal64ColumnPrinter(buffer);
+ }
+ break;
- default:
- throw std::logic_error("unknown batch type");
+ case DATE:
+ result = new DateColumnPrinter(buffer);
+ break;
+
+ case UNION:
+ result = new UnionColumnPrinter(buffer, *type);
+ break;
+
+ default:
+ throw std::logic_error("unknown batch type");
}
}
return std::unique_ptr<ColumnPrinter>(result);
}
- VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer) {
+ VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer) : ColumnPrinter(_buffer) {
// PASS
}
- void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
+ void VoidColumnPrinter::reset(const ColumnVectorBatch&) {
// PASS
}
@@ -301,13 +305,12 @@ namespace orc {
writeString(buffer, "null");
}
- LongColumnPrinter::LongColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ LongColumnPrinter::LongColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
- void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void LongColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
@@ -318,20 +321,17 @@ namespace orc {
} else {
char numBuffer[64];
snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d",
- static_cast<int64_t >(data[rowId]));
+ static_cast<int64_t>(data[rowId]));
writeString(buffer, numBuffer);
}
}
- DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- data(nullptr),
- isFloat(type.getKind() == FLOAT){
+ DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), data(nullptr), isFloat(type.getKind() == FLOAT) {
// PASS
}
- void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data();
}
@@ -341,20 +341,17 @@ namespace orc {
writeString(buffer, "null");
} else {
char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g",
- data[rowId]);
+ snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", data[rowId]);
writeString(buffer, numBuffer);
}
}
- Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
+ Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr), scale(0) {
// PASS
}
- void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data();
scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale;
@@ -376,13 +373,12 @@ namespace orc {
int32_t len = static_cast<int32_t>(str.length());
if (len > scale) {
return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
- str.substr(static_cast<size_t>(len - scale),
- static_cast<size_t>(scale));
+ str.substr(static_cast<size_t>(len - scale), static_cast<size_t>(scale));
} else if (len == scale) {
return sign + "0." + str;
} else {
std::string result = sign + "0.";
- for(int32_t i=0; i < scale - len; ++i) {
+ for (int32_t i = 0; i < scale - len; ++i) {
result += "0";
}
return result + str;
@@ -397,31 +393,27 @@ namespace orc {
}
}
- Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr),
- scale(0) {
- // PASS
- }
-
- void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
- ColumnPrinter::reset(batch);
- data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
- scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
- }
-
- void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
- if (hasNulls && !notNull[rowId]) {
- writeString(buffer, "null");
- } else {
- writeString(buffer, data[rowId].toDecimalString(scale).c_str());
- }
- }
-
- StringColumnPrinter::StringColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
+ Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr), scale(0) {
+ // PASS
+ }
+
+ void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);
+ data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data();
+ scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale;
+ }
+
+ void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {
+ writeString(buffer, data[rowId].toDecimalString(scale).c_str());
+ }
+ }
+
+ StringColumnPrinter::StringColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
// PASS
}
@@ -436,51 +428,48 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '"');
- for(int64_t i=0; i < length[rowId]; ++i) {
+ for (int64_t i = 0; i < length[rowId]; ++i) {
char ch = static_cast<char>(start[rowId][i]);
switch (ch) {
- case '\\':
- writeString(buffer, "\\\\");
- break;
- case '\b':
- writeString(buffer, "\\b");
- break;
- case '\f':
- writeString(buffer, "\\f");
- break;
- case '\n':
- writeString(buffer, "\\n");
- break;
- case '\r':
- writeString(buffer, "\\r");
- break;
- case '\t':
- writeString(buffer, "\\t");
- break;
- case '"':
- writeString(buffer, "\\\"");
- break;
- default:
- writeChar(buffer, ch);
- break;
+ case '\\':
+ writeString(buffer, "\\\\");
+ break;
+ case '\b':
+ writeString(buffer, "\\b");
+ break;
+ case '\f':
+ writeString(buffer, "\\f");
+ break;
+ case '\n':
+ writeString(buffer, "\\n");
+ break;
+ case '\r':
+ writeString(buffer, "\\r");
+ break;
+ case '\t':
+ writeString(buffer, "\\t");
+ break;
+ case '"':
+ writeString(buffer, "\\\"");
+ break;
+ default:
+ writeChar(buffer, ch);
+ break;
}
}
writeChar(buffer, '"');
}
}
- ListColumnPrinter::ListColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
+ ListColumnPrinter::ListColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), offsets(nullptr) {
elementPrinter = createColumnPrinter(buffer, type.getSubtype(0));
}
- void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data();
- elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).
- elements);
+ elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch).elements);
}
void ListColumnPrinter::printRow(uint64_t rowId) {
@@ -488,7 +477,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
if (i != offsets[rowId]) {
writeString(buffer, ", ");
}
@@ -498,15 +487,13 @@ namespace orc {
}
}
- MapColumnPrinter::MapColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- offsets(nullptr) {
+ MapColumnPrinter::MapColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), offsets(nullptr) {
keyPrinter = createColumnPrinter(buffer, type.getSubtype(0));
elementPrinter = createColumnPrinter(buffer, type.getSubtype(1));
}
- void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
offsets = myBatch.offsets.data();
@@ -519,7 +506,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) {
+ for (int64_t i = offsets[rowId]; i < offsets[rowId + 1]; ++i) {
if (i != offsets[rowId]) {
writeString(buffer, ", ");
}
@@ -533,23 +520,19 @@ namespace orc {
}
}
- UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer),
- tags(nullptr),
- offsets(nullptr) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer), tags(nullptr), offsets(nullptr) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
}
}
void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const UnionVectorBatch& unionBatch =
- dynamic_cast<const UnionVectorBatch&>(batch);
+ const UnionVectorBatch& unionBatch = dynamic_cast<const UnionVectorBatch&>(batch);
tags = unionBatch.tags.data();
offsets = unionBatch.offsets.data();
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(unionBatch.children[i]));
}
}
@@ -569,10 +552,9 @@ namespace orc {
}
}
- StructColumnPrinter::StructColumnPrinter(std::string& _buffer,
- const Type& type
- ): ColumnPrinter(_buffer) {
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
+ StructColumnPrinter::StructColumnPrinter(std::string& _buffer, const Type& type)
+ : ColumnPrinter(_buffer) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
fieldNames.push_back(type.getFieldName(i));
fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)));
}
@@ -580,9 +562,8 @@ namespace orc {
void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const StructVectorBatch& structBatch =
- dynamic_cast<const StructVectorBatch&>(batch);
- for(size_t i=0; i < fieldPrinter.size(); ++i) {
+ const StructVectorBatch& structBatch = dynamic_cast<const StructVectorBatch&>(batch);
+ for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(structBatch.fields[i]));
}
}
@@ -592,7 +573,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '{');
- for(unsigned int i=0; i < fieldPrinter.size(); ++i) {
+ for (unsigned int i = 0; i < fieldPrinter.size(); ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
@@ -605,9 +586,8 @@ namespace orc {
}
}
- DateColumnPrinter::DateColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ DateColumnPrinter::DateColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
@@ -631,9 +611,8 @@ namespace orc {
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- data(nullptr) {
+ BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), data(nullptr) {
// PASS
}
@@ -650,10 +629,8 @@ namespace orc {
data = dynamic_cast<const LongVectorBatch&>(batch).data.data();
}
- BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- start(nullptr),
- length(nullptr) {
+ BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), start(nullptr), length(nullptr) {
// PASS
}
@@ -662,7 +639,7 @@ namespace orc {
writeString(buffer, "null");
} else {
writeChar(buffer, '[');
- for(int64_t i=0; i < length[rowId]; ++i) {
+ for (int64_t i = 0; i < length[rowId]; ++i) {
if (i != 0) {
writeString(buffer, ", ");
}
@@ -681,10 +658,8 @@ namespace orc {
length = dynamic_cast<const StringVectorBatch&>(batch).length.data();
}
- TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer
- ): ColumnPrinter(_buffer),
- seconds(nullptr),
- nanoseconds(nullptr) {
+ TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer)
+ : ColumnPrinter(_buffer), seconds(nullptr), nanoseconds(nullptr) {
// PASS
}
@@ -713,19 +688,16 @@ namespace orc {
}
}
char numBuffer[64];
- snprintf(numBuffer, sizeof(numBuffer),
- "%0*" INT64_FORMAT_STRING "d\"",
- static_cast<int>(NANO_DIGITS - zeroDigits),
- static_cast<int64_t >(nanos));
+ snprintf(numBuffer, sizeof(numBuffer), "%0*" INT64_FORMAT_STRING "d\"",
+ static_cast<int>(NANO_DIGITS - zeroDigits), static_cast<int64_t>(nanos));
writeString(buffer, numBuffer);
}
}
void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
- const TimestampVectorBatch& ts =
- dynamic_cast<const TimestampVectorBatch&>(batch);
+ const TimestampVectorBatch& ts = dynamic_cast<const TimestampVectorBatch&>(batch);
seconds = ts.data.data();
nanoseconds = ts.nanoseconds.data();
}
-}
+} // namespace orc
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 5e2191f97..5106162c6 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -21,8 +21,8 @@
#include "Adaptor.hh"
#include "ByteRLE.hh"
#include "ColumnReader.hh"
-#include "orc/Exceptions.hh"
#include "RLE.hh"
+#include "orc/Exceptions.hh"
#include <math.h>
#include <iostream>
@@ -35,24 +35,23 @@ namespace orc {
inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
switch (static_cast<int64_t>(kind)) {
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- return RleVersion_1;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return RleVersion_2;
- default:
- throw ParseError("Unknown encoding in convertRleVersion");
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ return RleVersion_1;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return RleVersion_2;
+ default:
+ throw ParseError("Unknown encoding in convertRleVersion");
}
}
- ColumnReader::ColumnReader(const Type& type,
- StripeStreams& stripe
- ): columnId(type.getColumnId()),
- memoryPool(stripe.getMemoryPool()),
- metrics(stripe.getReaderMetrics()) {
+ ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe)
+ : columnId(type.getColumnId()),
+ memoryPool(stripe.getMemoryPool()),
+ metrics(stripe.getReaderMetrics()) {
std::unique_ptr<SeekableInputStream> stream =
- stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
+ stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
if (stream.get()) {
notNullDecoder = createBooleanRleDecoder(std::move(stream), metrics);
}
@@ -68,17 +67,14 @@ namespace orc {
// page through the values that we want to skip
// and count how many are non-null
const size_t MAX_BUFFER_SIZE = 32768;
- size_t bufferSize = std::min(MAX_BUFFER_SIZE,
- static_cast<size_t>(numValues));
+ size_t bufferSize = std::min(MAX_BUFFER_SIZE, static_cast<size_t>(numValues));
char buffer[MAX_BUFFER_SIZE];
uint64_t remaining = numValues;
while (remaining > 0) {
- uint64_t chunkSize =
- std::min(remaining,
- static_cast<uint64_t>(bufferSize));
+ uint64_t chunkSize = std::min(remaining, static_cast<uint64_t>(bufferSize));
decoder->next(buffer, chunkSize, nullptr);
remaining -= chunkSize;
- for(uint64_t i=0; i < chunkSize; ++i) {
+ for (uint64_t i = 0; i < chunkSize; ++i) {
if (!buffer[i]) {
numValues -= 1;
}
@@ -88,9 +84,7 @@ namespace orc {
return numValues;
}
- void ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* incomingMask) {
+ void ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* incomingMask) {
if (numValues > rowBatch.capacity) {
rowBatch.resize(numValues);
}
@@ -100,7 +94,7 @@ namespace orc {
char* notNullArray = rowBatch.notNull.data();
decoder->next(notNullArray, numValues, incomingMask);
// check to see if there are nulls in this batch
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
if (!notNullArray[i]) {
rowBatch.hasNulls = true;
return;
@@ -115,8 +109,7 @@ namespace orc {
rowBatch.hasNulls = false;
}
- void ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void ColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
if (notNullDecoder.get()) {
notNullDecoder->seek(positions.at(columnId));
}
@@ -131,36 +124,31 @@ namespace orc {
* @param numValues the number of bytes to convert to longs
*/
void expandBytesToLongs(int64_t* buffer, uint64_t numValues) {
- for(size_t i=numValues - 1; i < numValues; --i) {
- buffer[i] = reinterpret_cast<char *>(buffer)[i];
+ for (size_t i = numValues - 1; i < numValues; --i) {
+ buffer[i] = reinterpret_cast<char*>(buffer)[i];
}
}
- class BooleanColumnReader: public ColumnReader {
- private:
+ class BooleanColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::ByteRleDecoder> rle;
- public:
+ public:
BooleanColumnReader(const Type& type, StripeStreams& stipe);
~BooleanColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- BooleanColumnReader::BooleanColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
+ BooleanColumnReader::BooleanColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Boolean column");
+ if (stream == nullptr) throw ParseError("DATA stream not found in Boolean column");
rle = createBooleanRleDecoder(std::move(stream), metrics);
}
@@ -174,49 +162,42 @@ namespace orc {
return numValues;
}
- void BooleanColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// Since the byte rle places the output in a char* instead of long*,
// we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ int64_t* ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
expandBytesToLongs(ptr, numValues);
}
void BooleanColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
- class ByteColumnReader: public ColumnReader {
- private:
+ class ByteColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::ByteRleDecoder> rle;
- public:
+ public:
ByteColumnReader(const Type& type, StripeStreams& stipe);
~ByteColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- ByteColumnReader::ByteColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe){
+ ByteColumnReader::ByteColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Byte column");
+ if (stream == nullptr) throw ParseError("DATA stream not found in Byte column");
rle = createByteRleDecoder(std::move(stream), metrics);
}
@@ -230,52 +211,43 @@ namespace orc {
return numValues;
}
- void ByteColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void ByteColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// Since the byte rle places the output in a char* instead of long*,
// we cheat here and use the long* and then expand it in a second pass.
- int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
- rle->next(reinterpret_cast<char*>(ptr),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ int64_t* ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data();
+ rle->next(reinterpret_cast<char*>(ptr), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
expandBytesToLongs(ptr, numValues);
}
- void ByteColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void ByteColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
- class IntegerColumnReader: public ColumnReader {
- protected:
+ class IntegerColumnReader : public ColumnReader {
+ protected:
std::unique_ptr<orc::RleDecoder> rle;
- public:
+ public:
IntegerColumnReader(const Type& type, StripeStreams& stripe);
~IntegerColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- IntegerColumnReader::IntegerColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ IntegerColumnReader::IntegerColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Integer column");
- rle = createRleDecoder(
- std::move(stream), true, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("DATA stream not found in Integer column");
+ rle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
}
IntegerColumnReader::~IntegerColumnReader() {
@@ -288,22 +260,20 @@ namespace orc {
return numValues;
}
- void IntegerColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(),
- numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
+ rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), numValues,
+ rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr);
}
void IntegerColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
- class TimestampColumnReader: public ColumnReader {
- private:
+ class TimestampColumnReader : public ColumnReader {
+ private:
std::unique_ptr<orc::RleDecoder> secondsRle;
std::unique_ptr<orc::RleDecoder> nanoRle;
const Timezone& writerTimezone;
@@ -311,47 +281,32 @@ namespace orc {
const int64_t epochOffset;
const bool sameTimezone;
- public:
- TimestampColumnReader(const Type& type,
- StripeStreams& stripe,
- bool isInstantType);
+ public:
+ TimestampColumnReader(const Type& type, StripeStreams& stripe, bool isInstantType);
~TimestampColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
-
- TimestampColumnReader::TimestampColumnReader(const Type& type,
- StripeStreams& stripe,
- bool isInstantType
- ): ColumnReader(type, stripe),
- writerTimezone(isInstantType ?
- getTimezoneByName("GMT") :
- stripe.getWriterTimezone()),
- readerTimezone(isInstantType ?
- getTimezoneByName("GMT") :
- stripe.getReaderTimezone()),
- epochOffset(writerTimezone.getEpoch()),
- sameTimezone(&writerTimezone == &readerTimezone){
+ TimestampColumnReader::TimestampColumnReader(const Type& type, StripeStreams& stripe,
+ bool isInstantType)
+ : ColumnReader(type, stripe),
+ writerTimezone(isInstantType ? getTimezoneByName("GMT") : stripe.getWriterTimezone()),
+ readerTimezone(isInstantType ? getTimezoneByName("GMT") : stripe.getReaderTimezone()),
+ epochOffset(writerTimezone.getEpoch()),
+ sameTimezone(&writerTimezone == &readerTimezone) {
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("DATA stream not found in Timestamp column");
- secondsRle = createRleDecoder(
- std::move(stream), true, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("DATA stream not found in Timestamp column");
+ secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Timestamp column");
- nanoRle = createRleDecoder(
- std::move(stream), false, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("SECONDARY stream not found in Timestamp column");
+ nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
}
TimestampColumnReader::~TimestampColumnReader() {
@@ -365,25 +320,22 @@ namespace orc {
return numValues;
}
- void TimestampColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- TimestampVectorBatch& timestampBatch =
- dynamic_cast<TimestampVectorBatch&>(rowBatch);
- int64_t *secsBuffer = timestampBatch.data.data();
+ TimestampVectorBatch& timestampBatch = dynamic_cast<TimestampVectorBatch&>(rowBatch);
+ int64_t* secsBuffer = timestampBatch.data.data();
secondsRle->next(secsBuffer, numValues, notNull);
- int64_t *nanoBuffer = timestampBatch.nanoseconds.data();
+ int64_t* nanoBuffer = timestampBatch.nanoseconds.data();
nanoRle->next(nanoBuffer, numValues, notNull);
// Construct the values
- for(uint64_t i=0; i < numValues; i++) {
+ for (uint64_t i = 0; i < numValues; i++) {
if (notNull == nullptr || notNull[i]) {
uint64_t zeros = nanoBuffer[i] & 0x7;
nanoBuffer[i] >>= 3;
if (zeros != 0) {
- for(uint64_t j = 0; j <= zeros; ++j) {
+ for (uint64_t j = 0; j <= zeros; ++j) {
nanoBuffer[i] *= 10;
}
}
@@ -410,38 +362,34 @@ namespace orc {
}
void TimestampColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
secondsRle->seek(positions.at(columnId));
nanoRle->seek(positions.at(columnId));
}
- template<TypeKind columnKind, bool isLittleEndian>
- class DoubleColumnReader: public ColumnReader {
- public:
+ template <TypeKind columnKind, bool isLittleEndian>
+ class DoubleColumnReader : public ColumnReader {
+ public:
DoubleColumnReader(const Type& type, StripeStreams& stripe);
~DoubleColumnReader() override {}
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
+ private:
std::unique_ptr<SeekableInputStream> inputStream;
const uint64_t bytesPerValue = (columnKind == FLOAT) ? 4 : 8;
- const char *bufferPointer;
- const char *bufferEnd;
+ const char* bufferPointer;
+ const char* bufferEnd;
unsigned char readByte() {
if (bufferPointer == bufferEnd) {
int length;
- if (!inputStream->Next
- (reinterpret_cast<const void**>(&bufferPointer), &length)) {
+ if (!inputStream->Next(reinterpret_cast<const void**>(&bufferPointer), &length)) {
throw ParseError("bad read in DoubleColumnReader::next()");
}
bufferEnd = bufferPointer + length;
@@ -470,7 +418,7 @@ namespace orc {
bits |= static_cast<int64_t>(readByte()) << (i * 8);
}
}
- double *result = reinterpret_cast<double*>(&bits);
+ double* result = reinterpret_cast<double*>(&bits);
return *result;
}
@@ -491,33 +439,28 @@ namespace orc {
bits |= readByte() << (i * 8);
}
}
- float *result = reinterpret_cast<float*>(&bits);
+ float* result = reinterpret_cast<float*>(&bits);
return static_cast<double>(*result);
}
};
- template<TypeKind columnKind, bool isLittleEndian>
- DoubleColumnReader<columnKind, isLittleEndian>::DoubleColumnReader(
- const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- bufferPointer(nullptr),
- bufferEnd(nullptr) {
+ template <TypeKind columnKind, bool isLittleEndian>
+ DoubleColumnReader<columnKind, isLittleEndian>::DoubleColumnReader(const Type& type,
+ StripeStreams& stripe)
+ : ColumnReader(type, stripe), bufferPointer(nullptr), bufferEnd(nullptr) {
inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (inputStream == nullptr)
- throw ParseError("DATA stream not found in Double column");
+ if (inputStream == nullptr) throw ParseError("DATA stream not found in Double column");
}
- template<TypeKind columnKind, bool isLittleEndian>
+ template <TypeKind columnKind, bool isLittleEndian>
uint64_t DoubleColumnReader<columnKind, isLittleEndian>::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- if (static_cast<size_t>(bufferEnd - bufferPointer) >=
- bytesPerValue * numValues) {
+ if (static_cast<size_t>(bufferEnd - bufferPointer) >= bytesPerValue * numValues) {
bufferPointer += bytesPerValue * numValues;
} else {
- size_t sizeToSkip = bytesPerValue * numValues -
- static_cast<size_t>(bufferEnd - bufferPointer);
+ size_t sizeToSkip =
+ bytesPerValue * numValues - static_cast<size_t>(bufferEnd - bufferPointer);
const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
while (sizeToSkip != 0) {
size_t step = sizeToSkip > cap ? cap : sizeToSkip;
@@ -531,11 +474,9 @@ namespace orc {
return numValues;
}
- template<TypeKind columnKind, bool isLittleEndian>
- void DoubleColumnReader<columnKind, isLittleEndian>::next(
- ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <TypeKind columnKind, bool isLittleEndian>
+ void DoubleColumnReader<columnKind, isLittleEndian>::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
@@ -543,19 +484,19 @@ namespace orc {
if (columnKind == FLOAT) {
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
outArray[i] = readFloat();
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
outArray[i] = readFloat();
}
}
} else {
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
outArray[i] = readDouble();
}
@@ -565,8 +506,8 @@ namespace orc {
// Only viable when the machine is little-endian.
uint64_t bufferNum = 0;
if (isLittleEndian) {
- bufferNum = std::min(numValues,
- static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue);
+ bufferNum =
+ std::min(numValues, static_cast<size_t>(bufferEnd - bufferPointer) / bytesPerValue);
uint64_t bufferBytes = bufferNum * bytesPerValue;
memcpy(outArray, bufferPointer, bufferBytes);
bufferPointer += bufferBytes;
@@ -578,7 +519,7 @@ namespace orc {
}
}
- template<TypeKind columnKind, bool isLittleEndian>
+ template <TypeKind columnKind, bool isLittleEndian>
void DoubleColumnReader<columnKind, isLittleEndian>::seekToRowGroup(
std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
@@ -604,55 +545,46 @@ namespace orc {
}
}
- class StringDictionaryColumnReader: public ColumnReader {
- private:
+ class StringDictionaryColumnReader : public ColumnReader {
+ private:
std::shared_ptr<StringDictionary> dictionary;
std::unique_ptr<RleDecoder> rle;
- public:
+ public:
StringDictionaryColumnReader(const Type& type, StripeStreams& stipe);
~StringDictionaryColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- StringDictionaryColumnReader::StringDictionaryColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe),
- dictionary(new StringDictionary(stripe.getMemoryPool())) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
+ StringDictionaryColumnReader::StringDictionaryColumnReader(const Type& type,
+ StripeStreams& stripe)
+ : ColumnReader(type, stripe), dictionary(new StringDictionary(stripe.getMemoryPool())) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind());
uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize();
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
if (stream == nullptr) {
throw ParseError("DATA stream not found in StringDictionaryColumn");
}
- rle = createRleDecoder(
- std::move(stream), false, rleVersion, memoryPool, metrics);
+ rle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
stream = stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false);
if (dictSize > 0 && stream == nullptr) {
throw ParseError("LENGTH stream not found in StringDictionaryColumn");
}
- std::unique_ptr<RleDecoder> lengthDecoder = createRleDecoder(
- std::move(stream), false, rleVersion, memoryPool, metrics);
+ std::unique_ptr<RleDecoder> lengthDecoder =
+ createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
dictionary->dictionaryOffset.resize(dictSize + 1);
int64_t* lengthArray = dictionary->dictionaryOffset.data();
lengthDecoder->next(lengthArray + 1, dictSize, nullptr);
lengthArray[0] = 0;
- for(uint32_t i = 1; i < dictSize + 1; ++i) {
+ for (uint32_t i = 1; i < dictSize + 1; ++i) {
if (lengthArray[i] < 0) {
throw ParseError("Negative dictionary entry length");
}
@@ -661,10 +593,9 @@ namespace orc {
int64_t blobSize = lengthArray[dictSize];
dictionary->dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
std::unique_ptr<SeekableInputStream> blobStream =
- stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+ stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
if (blobSize > 0 && blobStream == nullptr) {
- throw ParseError(
- "DICTIONARY_DATA stream not found in StringDictionaryColumn");
+ throw ParseError("DICTIONARY_DATA stream not found in StringDictionaryColumn");
}
readFully(dictionary->dictionaryBlob.data(), blobSize, blobStream.get());
}
@@ -679,47 +610,43 @@ namespace orc {
return numValues;
}
- void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char *blob = dictionary->dictionaryBlob.data();
- int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data();
- char **outputStarts = byteBatch.data.data();
- int64_t *outputLengths = byteBatch.length.data();
+ char* blob = dictionary->dictionaryBlob.data();
+ int64_t* dictionaryOffsets = dictionary->dictionaryOffset.data();
+ char** outputStarts = byteBatch.data.data();
+ int64_t* outputLengths = byteBatch.length.data();
rle->next(outputLengths, numValues, notNull);
uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1;
if (notNull) {
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
int64_t entry = outputLengths[i];
- if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) {
+ if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
throw ParseError("Entry index out of range in StringDictionaryColumn");
}
outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry];
}
}
} else {
- for(uint64_t i=0; i < numValues; ++i) {
+ for (uint64_t i = 0; i < numValues; ++i) {
int64_t entry = outputLengths[i];
if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) {
throw ParseError("Entry index out of range in StringDictionaryColumn");
}
outputStarts[i] = blob + dictionaryOffsets[entry];
- outputLengths[i] = dictionaryOffsets[entry+1] -
- dictionaryOffsets[entry];
+ outputLengths[i] = dictionaryOffsets[entry + 1] - dictionaryOffsets[entry];
}
}
}
- void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull) {
+ void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
rowBatch.isEncoded = true;
@@ -732,17 +659,16 @@ namespace orc {
}
void StringDictionaryColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
}
-
- class StringDirectColumnReader: public ColumnReader {
- private:
+ class StringDirectColumnReader : public ColumnReader {
+ private:
std::unique_ptr<RleDecoder> lengthRle;
std::unique_ptr<SeekableInputStream> blobStream;
- const char *lastBuffer;
+ const char* lastBuffer;
size_t lastBufferLength;
/**
@@ -752,38 +678,28 @@ namespace orc {
* @param numValues the lengths of the arrays
* @return the total number of bytes for the non-null values
*/
- size_t computeSize(const int64_t *lengths, const char *notNull,
- uint64_t numValues);
+ size_t computeSize(const int64_t* lengths, const char* notNull, uint64_t numValues);
- public:
+ public:
StringDirectColumnReader(const Type& type, StripeStreams& stipe);
~StringDirectColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
- StringDirectColumnReader::StringDirectColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
- RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId)
- .kind());
+ StringDirectColumnReader::StringDirectColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
+ RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in StringDirectColumn");
- lengthRle = createRleDecoder(
- std::move(stream), false, rleVersion, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in StringDirectColumn");
+ lengthRle = createRleDecoder(std::move(stream), false, rleVersion, memoryPool, metrics);
blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (blobStream == nullptr)
- throw ParseError("DATA stream not found in StringDirectColumn");
+ if (blobStream == nullptr) throw ParseError("DATA stream not found in StringDirectColumn");
lastBuffer = nullptr;
lastBufferLength = 0;
}
@@ -800,8 +716,7 @@ namespace orc {
size_t totalBytes = 0;
// read the lengths, so we know haw many bytes to skip
while (done < numValues) {
- uint64_t step = std::min(BUFFER_SIZE,
- static_cast<size_t>(numValues - done));
+ uint64_t step = std::min(BUFFER_SIZE, static_cast<size_t>(numValues - done));
lengthRle->next(buffer, step, nullptr);
totalBytes += computeSize(buffer, nullptr, step);
done += step;
@@ -825,33 +740,31 @@ namespace orc {
return numValues;
}
- size_t StringDirectColumnReader::computeSize(const int64_t* lengths,
- const char* notNull,
+ size_t StringDirectColumnReader::computeSize(const int64_t* lengths, const char* notNull,
uint64_t numValues) {
size_t totalLength = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
totalLength += static_cast<size_t>(lengths[i]);
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
totalLength += static_cast<size_t>(lengths[i]);
}
}
return totalLength;
}
- void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
// update the notNull from the parent class
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch);
- char **startPtr = byteBatch.data.data();
- int64_t *lengthPtr = byteBatch.length.data();
+ char** startPtr = byteBatch.data.data();
+ int64_t* lengthPtr = byteBatch.length.data();
// read the length vector
lengthRle->next(lengthPtr, numValues, notNull);
@@ -863,7 +776,7 @@ namespace orc {
// to get the rest directly out of the stream's buffer.
size_t bytesBuffered = 0;
byteBatch.blob.resize(totalLength);
- char *ptr= byteBatch.blob.data();
+ char* ptr = byteBatch.blob.data();
while (bytesBuffered + lastBufferLength < totalLength) {
memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
bytesBuffered += lastBufferLength;
@@ -903,7 +816,7 @@ namespace orc {
}
void StringDirectColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
blobStream->seek(positions.at(columnId));
lengthRle->seek(positions.at(columnId));
@@ -912,143 +825,120 @@ namespace orc {
lastBufferLength = 0;
}
- class StructColumnReader: public ColumnReader {
- private:
+ class StructColumnReader : public ColumnReader {
+ private:
std::vector<std::unique_ptr<ColumnReader>> children;
- public:
+ public:
StructColumnReader(const Type& type, StripeStreams& stipe);
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- StructColumnReader::StructColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ StructColumnReader::StructColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
// count the number of selected sub-columns
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) {
- case proto::ColumnEncoding_Kind_DIRECT:
- for(unsigned int i=0; i < type.getSubtypeCount(); ++i) {
- const Type& child = *type.getSubtype(i);
- if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
- children.push_back(buildReader(child, stripe));
+ case proto::ColumnEncoding_Kind_DIRECT:
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+ const Type& child = *type.getSubtype(i);
+ if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
+ children.push_back(buildReader(child, stripe));
+ }
}
- }
- break;
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- default:
- throw ParseError("Unknown encoding for StructColumnReader");
+ break;
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ default:
+ throw ParseError("Unknown encoding for StructColumnReader");
}
}
uint64_t StructColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- for(auto& ptr : children) {
+ for (auto& ptr : children) {
ptr->skip(numValues);
}
return numValues;
}
- void StructColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StructColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- uint64_t i=0;
- notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr;
- for(auto iter = children.begin(); iter != children.end(); ++iter, ++i) {
+ uint64_t i = 0;
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+ for (auto iter = children.begin(); iter != children.end(); ++iter, ++i) {
if (encoded) {
- (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
+ (*iter)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues,
+ notNull);
} else {
- (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]),
- numValues, notNull);
+ (*iter)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), numValues, notNull);
}
}
}
void StructColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
- for(auto& ptr : children) {
+ for (auto& ptr : children) {
ptr->seekToRowGroup(positions);
}
}
- class ListColumnReader: public ColumnReader {
- private:
+ class ListColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ColumnReader> child;
std::unique_ptr<RleDecoder> rle;
- public:
+ public:
ListColumnReader(const Type& type, StripeStreams& stipe);
~ListColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- ListColumnReader::ListColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ ListColumnReader::ListColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
// count the number of selected sub-columns
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in List column");
- rle = createRleDecoder(
- std::move(stream), false, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in List column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& childType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) {
child = buildReader(childType, stripe);
@@ -1061,7 +951,7 @@ namespace orc {
uint64_t ListColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader *childReader = child.get();
+ ColumnReader* childReader = child.get();
if (childReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -1070,7 +960,7 @@ namespace orc {
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
lengthsRead += chunk;
@@ -1082,30 +972,26 @@ namespace orc {
return numValues;
}
- void ListColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void ListColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
+ ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
int64_t* offsets = listBatch.offsets.data();
notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr;
rle->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
@@ -1115,14 +1001,14 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
totalChildren += tmp;
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *childReader = child.get();
+ ColumnReader* childReader = child.get();
if (childReader) {
if (encoded) {
childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr);
@@ -1132,8 +1018,7 @@ namespace orc {
}
}
- void ListColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void ListColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
if (child.get()) {
@@ -1141,48 +1026,38 @@ namespace orc {
}
}
- class MapColumnReader: public ColumnReader {
- private:
+ class MapColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ColumnReader> keyReader;
std::unique_ptr<ColumnReader> elementReader;
std::unique_ptr<RleDecoder> rle;
- public:
+ public:
MapColumnReader(const Type& type, StripeStreams& stipe);
~MapColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- MapColumnReader::MapColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ MapColumnReader::MapColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
// Determine if the key and/or value columns are selected
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in Map column");
- rle = createRleDecoder(
- std::move(stream), false, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in Map column");
+ rle = createRleDecoder(std::move(stream), false, vers, memoryPool, metrics);
const Type& keyType = *type.getSubtype(0);
if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) {
keyReader = buildReader(keyType, stripe);
@@ -1199,8 +1074,8 @@ namespace orc {
uint64_t MapColumnReader::skip(uint64_t numValues) {
numValues = ColumnReader::skip(numValues);
- ColumnReader *rawKeyReader = keyReader.get();
- ColumnReader *rawElementReader = elementReader.get();
+ ColumnReader* rawKeyReader = keyReader.get();
+ ColumnReader* rawElementReader = elementReader.get();
if (rawKeyReader || rawElementReader) {
const uint64_t BUFFER_SIZE = 1024;
int64_t buffer[BUFFER_SIZE];
@@ -1209,7 +1084,7 @@ namespace orc {
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
childrenElements += static_cast<size_t>(buffer[i]);
}
lengthsRead += chunk;
@@ -1226,32 +1101,26 @@ namespace orc {
return numValues;
}
- void MapColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
+ void MapColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull)
- {
+ void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
+ MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
int64_t* offsets = mapBatch.offsets.data();
notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr;
rle->next(offsets, numValues, notNull);
uint64_t totalChildren = 0;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
@@ -1261,14 +1130,14 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
uint64_t tmp = static_cast<uint64_t>(offsets[i]);
offsets[i] = static_cast<int64_t>(totalChildren);
totalChildren += tmp;
}
}
offsets[numValues] = static_cast<int64_t>(totalChildren);
- ColumnReader *rawKeyReader = keyReader.get();
+ ColumnReader* rawKeyReader = keyReader.get();
if (rawKeyReader) {
if (encoded) {
rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr);
@@ -1276,7 +1145,7 @@ namespace orc {
rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr);
}
}
- ColumnReader *rawElementReader = elementReader.get();
+ ColumnReader* rawElementReader = elementReader.get();
if (rawElementReader) {
if (encoded) {
rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr);
@@ -1286,8 +1155,7 @@ namespace orc {
}
}
- void MapColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ void MapColumnReader::seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
if (keyReader.get()) {
@@ -1298,52 +1166,43 @@ namespace orc {
}
}
- class UnionColumnReader: public ColumnReader {
- private:
+ class UnionColumnReader : public ColumnReader {
+ private:
std::unique_ptr<ByteRleDecoder> rle;
std::vector<std::unique_ptr<ColumnReader>> childrenReader;
std::vector<int64_t> childrenCounts;
uint64_t numChildren;
- public:
+ public:
UnionColumnReader(const Type& type, StripeStreams& stipe);
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
- private:
- template<bool encoded>
- void nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull);
+ private:
+ template <bool encoded>
+ void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
};
- UnionColumnReader::UnionColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ UnionColumnReader::UnionColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
numChildren = type.getSubtypeCount();
childrenReader.resize(numChildren);
childrenCounts.resize(numChildren);
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (stream == nullptr)
- throw ParseError("LENGTH stream not found in Union column");
+ if (stream == nullptr) throw ParseError("LENGTH stream not found in Union column");
rle = createByteRleDecoder(std::move(stream), metrics);
// figure out which types are selected
const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
- for(unsigned int i=0; i < numChildren; ++i) {
- const Type &child = *type.getSubtype(i);
+ for (unsigned int i = 0; i < numChildren; ++i) {
+ const Type& child = *type.getSubtype(i);
if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
childrenReader[i] = buildReader(child, stripe);
}
@@ -1355,17 +1214,17 @@ namespace orc {
const uint64_t BUFFER_SIZE = 1024;
char buffer[BUFFER_SIZE];
uint64_t lengthsRead = 0;
- int64_t *counts = childrenCounts.data();
+ int64_t* counts = childrenCounts.data();
memset(counts, 0, sizeof(int64_t) * numChildren);
while (lengthsRead < numValues) {
uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
rle->next(buffer, chunk, nullptr);
- for(size_t i=0; i < chunk; ++i) {
+ for (size_t i = 0; i < chunk; ++i) {
counts[static_cast<size_t>(buffer[i])] += 1;
}
lengthsRead += chunk;
}
- for(size_t i=0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (counts[i] != 0 && childrenReader[i] != nullptr) {
childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
}
@@ -1373,63 +1232,57 @@ namespace orc {
return numValues;
}
- void UnionColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void UnionColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
nextInternal<false>(rowBatch, numValues, notNull);
}
- void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
nextInternal<true>(rowBatch, numValues, notNull);
}
- template<bool encoded>
- void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ template <bool encoded>
+ void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
- UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
+ UnionVectorBatch& unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
uint64_t* offsets = unionBatch.offsets.data();
int64_t* counts = childrenCounts.data();
memset(counts, 0, sizeof(int64_t) * numChildren);
unsigned char* tags = unionBatch.tags.data();
notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : nullptr;
- rle->next(reinterpret_cast<char *>(tags), numValues, notNull);
+ rle->next(reinterpret_cast<char*>(tags), numValues, notNull);
// set the offsets for each row
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
- offsets[i] =
- static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
+ for (size_t i = 0; i < numValues; ++i) {
+ offsets[i] = static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++);
}
}
// read the right number of each child column
- for(size_t i=0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (childrenReader[i] != nullptr) {
if (encoded) {
childrenReader[i]->nextEncoded(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
+ static_cast<uint64_t>(counts[i]), nullptr);
} else {
- childrenReader[i]->next(*(unionBatch.children[i]),
- static_cast<uint64_t>(counts[i]), nullptr);
+ childrenReader[i]->next(*(unionBatch.children[i]), static_cast<uint64_t>(counts[i]),
+ nullptr);
}
}
}
}
void UnionColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
rle->seek(positions.at(columnId));
- for(size_t i = 0; i < numChildren; ++i) {
+ for (size_t i = 0; i < numChildren; ++i) {
if (childrenReader[i] != nullptr) {
childrenReader[i]->seekToRowGroup(positions);
}
@@ -1449,13 +1302,13 @@ namespace orc {
}
}
- class Decimal64ColumnReader: public ColumnReader {
- public:
+ class Decimal64ColumnReader : public ColumnReader {
+ public:
static const uint32_t MAX_PRECISION_64 = 18;
static const uint32_t MAX_PRECISION_128 = 38;
static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1];
- protected:
+ protected:
std::unique_ptr<SeekableInputStream> valueStream;
int32_t precision;
int32_t scale;
@@ -1470,9 +1323,8 @@ namespace orc {
void readBuffer() {
while (buffer == bufferEnd) {
int length;
- if (!valueStream->Next(reinterpret_cast<const void**>(&buffer),
- &length)) {
- throw ParseError("Read past end of stream in Decimal64ColumnReader "+
+ if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), &length)) {
+ throw ParseError("Read past end of stream in Decimal64ColumnReader " +
valueStream->getName());
}
bufferEnd = buffer + length;
@@ -1492,70 +1344,61 @@ namespace orc {
}
}
value = unZigZag(static_cast<uint64_t>(value));
- if (scale > currentScale &&
- static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
+ if (scale > currentScale && static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) {
value *= POWERS_OF_TEN[scale - currentScale];
} else if (scale < currentScale &&
- static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
+ static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) {
value /= POWERS_OF_TEN[currentScale - scale];
} else if (scale != currentScale) {
throw ParseError("Decimal scale out of range");
}
}
- public:
+ public:
Decimal64ColumnReader(const Type& type, StripeStreams& stipe);
~Decimal64ColumnReader() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) override;
+ void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions) override;
};
const uint32_t Decimal64ColumnReader::MAX_PRECISION_64;
const uint32_t Decimal64ColumnReader::MAX_PRECISION_128;
- const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]=
- {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000,
- 10000000000000000,
- 100000000000000000,
- 1000000000000000000};
-
- Decimal64ColumnReader::Decimal64ColumnReader(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+
+ Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
scale = static_cast<int32_t>(type.getScale());
precision = static_cast<int32_t>(type.getPrecision());
valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
- if (valueStream == nullptr)
- throw ParseError("DATA stream not found in Decimal64Column");
+ if (valueStream == nullptr) throw ParseError("DATA stream not found in Decimal64Column");
buffer = nullptr;
bufferEnd = nullptr;
RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind());
std::unique_ptr<SeekableInputStream> stream =
stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
- if (stream == nullptr)
- throw ParseError("SECONDARY stream not found in Decimal64Column");
- scaleDecoder = createRleDecoder(
- std::move(stream), true, vers, memoryPool, metrics);
+ if (stream == nullptr) throw ParseError("SECONDARY stream not found in Decimal64Column");
+ scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool, metrics);
}
Decimal64ColumnReader::~Decimal64ColumnReader() {
@@ -1575,13 +1418,10 @@ namespace orc {
return numValues;
}
- void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal64VectorBatch &batch =
- dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+ Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch);
int64_t* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1589,13 +1429,13 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
@@ -1603,28 +1443,25 @@ namespace orc {
void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) {
if (scale > currentScale) {
- while(scale > currentScale) {
+ while (scale > currentScale) {
uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- scale - currentScale);
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64, scale - currentScale);
value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust];
currentScale += scaleAdjust;
}
} else if (scale < currentScale) {
Int128 remainder;
- while(currentScale > scale) {
+ while (currentScale > scale) {
uint32_t scaleAdjust =
- std::min(Decimal64ColumnReader::MAX_PRECISION_64,
- currentScale - scale);
- value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust],
- remainder);
+ std::min(Decimal64ColumnReader::MAX_PRECISION_64, currentScale - scale);
+ value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], remainder);
currentScale -= scaleAdjust;
}
}
}
void Decimal64ColumnReader::seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions) {
+ std::unordered_map<uint64_t, PositionProvider>& positions) {
ColumnReader::seekToRowGroup(positions);
valueStream->seek(positions.at(columnId));
scaleDecoder->seek(positions.at(columnId));
@@ -1633,16 +1470,14 @@ namespace orc {
bufferEnd = nullptr;
}
- class Decimal128ColumnReader: public Decimal64ColumnReader {
- public:
+ class Decimal128ColumnReader : public Decimal64ColumnReader {
+ public:
Decimal128ColumnReader(const Type& type, StripeStreams& stipe);
~Decimal128ColumnReader() override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
- private:
+ private:
void readInt128(Int128& value, int32_t currentScale) {
value = 0;
Int128 work;
@@ -1652,22 +1487,19 @@ namespace orc {
unsigned char ch = static_cast<unsigned char>(*(buffer++));
work = ch & 0x7f;
work <<= offset;
- value |= work;
+ value |= work;
offset += 7;
if (!(ch & 0x80)) {
break;
}
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
+ scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale));
}
};
- Decimal128ColumnReader::Decimal128ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
+ Decimal128ColumnReader::Decimal128ColumnReader(const Type& type, StripeStreams& stripe)
+ : Decimal64ColumnReader(type, stripe) {
// PASS
}
@@ -1675,13 +1507,11 @@ namespace orc {
// PASS
}
- void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
Int128* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1689,38 +1519,35 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
}
- class Decimal64ColumnReaderV2: public ColumnReader {
- protected:
+ class Decimal64ColumnReaderV2 : public ColumnReader {
+ protected:
std::unique_ptr<RleDecoder> valueDecoder;
int32_t precision;
int32_t scale;
- public:
+ public:
Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe);
~Decimal64ColumnReaderV2() override;
uint64_t skip(uint64_t numValues) override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
};
- Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type,
- StripeStreams& stripe
- ): ColumnReader(type, stripe) {
+ Decimal64ColumnReaderV2::Decimal64ColumnReaderV2(const Type& type, StripeStreams& stripe)
+ : ColumnReader(type, stripe) {
scale = static_cast<int32_t>(type.getScale());
precision = static_cast<int32_t>(type.getPrecision());
std::unique_ptr<SeekableInputStream> stream =
@@ -1730,8 +1557,7 @@ namespace orc {
ss << "DATA stream not found in Decimal64V2 column. ColumnId=" << columnId;
throw ParseError(ss.str());
}
- valueDecoder = createRleDecoder(
- std::move(stream), true, RleVersion_2, memoryPool, metrics);
+ valueDecoder = createRleDecoder(std::move(stream), true, RleVersion_2, memoryPool, metrics);
}
Decimal64ColumnReaderV2::~Decimal64ColumnReaderV2() {
@@ -1744,20 +1570,18 @@ namespace orc {
return numValues;
}
- void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void Decimal64ColumnReaderV2::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal64VectorBatch &batch =
- dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+ Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch);
valueDecoder->next(batch.values.data(), numValues, notNull);
batch.precision = precision;
batch.scale = scale;
}
- class DecimalHive11ColumnReader: public Decimal64ColumnReader {
- private:
+ class DecimalHive11ColumnReader : public Decimal64ColumnReader {
+ private:
bool throwOnOverflow;
std::ostream* errorStream;
@@ -1767,7 +1591,7 @@ namespace orc {
bool readInt128(Int128& value, int32_t currentScale) {
// -/+ 99999999999999999999999999999999999999
static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
- static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff);
+ static const Int128 MAX_VALUE(0x4b3b4ca85a86c47a, 0x098a223fffffffff);
value = 0;
Int128 work;
@@ -1783,7 +1607,7 @@ namespace orc {
result = false;
}
work <<= offset;
- value |= work;
+ value |= work;
offset += 7;
if (!(ch & 0x80)) {
break;
@@ -1794,24 +1618,19 @@ namespace orc {
return result;
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
- static_cast<uint32_t>(currentScale));
+ scaleInt128(value, static_cast<uint32_t>(scale), static_cast<uint32_t>(currentScale));
return value >= MIN_VALUE && value <= MAX_VALUE;
}
- public:
+ public:
DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe);
~DecimalHive11ColumnReader() override;
- void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) override;
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) override;
};
- DecimalHive11ColumnReader::DecimalHive11ColumnReader
- (const Type& type,
- StripeStreams& stripe
- ): Decimal64ColumnReader(type, stripe) {
+ DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type, StripeStreams& stripe)
+ : Decimal64ColumnReader(type, stripe) {
scale = stripe.getForcedScaleOnHive11Decimal();
throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow();
errorStream = stripe.getErrorStream();
@@ -1821,13 +1640,11 @@ namespace orc {
// PASS
}
- void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char *notNull) {
+ void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) {
ColumnReader::next(rowBatch, numValues, notNull);
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
- Decimal128VectorBatch &batch =
- dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+ Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
Int128* values = batch.values.data();
// read the next group of scales
int64_t* scaleBuffer = batch.readScales.data();
@@ -1837,10 +1654,9 @@ namespace orc {
batch.precision = precision;
batch.scale = scale;
if (notNull) {
- for(size_t i=0; i < numValues; ++i) {
+ for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
+ if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -1853,9 +1669,8 @@ namespace orc {
}
}
} else {
- for(size_t i=0; i < numValues; ++i) {
- if (!readInt128(values[i],
- static_cast<int32_t>(scaleBuffer[i]))) {
+ for (size_t i = 0; i < numValues; ++i) {
+ if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -1871,109 +1686,92 @@ namespace orc {
}
static bool isLittleEndian() {
- static union { uint32_t i; char c[4]; } num = { 0x01020304 };
+ static union {
+ uint32_t i;
+ char c[4];
+ } num = {0x01020304};
return num.c[0] == 4;
}
/**
* Create a reader for the given stripe.
*/
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe) {
+ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe) {
switch (static_cast<int64_t>(type.getKind())) {
- case DATE:
- case INT:
- case LONG:
- case SHORT:
- return std::unique_ptr<ColumnReader>(
- new IntegerColumnReader(type, stripe));
- case BINARY:
- case CHAR:
- case STRING:
- case VARCHAR:
- switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){
- case proto::ColumnEncoding_Kind_DICTIONARY:
- case proto::ColumnEncoding_Kind_DICTIONARY_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDictionaryColumnReader(type, stripe));
- case proto::ColumnEncoding_Kind_DIRECT:
- case proto::ColumnEncoding_Kind_DIRECT_V2:
- return std::unique_ptr<ColumnReader>(
- new StringDirectColumnReader(type, stripe));
- default:
- throw NotImplementedYet("buildReader unhandled string encoding");
- }
+ case DATE:
+ case INT:
+ case LONG:
+ case SHORT:
+ return std::unique_ptr<ColumnReader>(new IntegerColumnReader(type, stripe));
+ case BINARY:
+ case CHAR:
+ case STRING:
+ case VARCHAR:
+ switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
+ case proto::ColumnEncoding_Kind_DICTIONARY:
+ case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+ return std::unique_ptr<ColumnReader>(new StringDictionaryColumnReader(type, stripe));
+ case proto::ColumnEncoding_Kind_DIRECT:
+ case proto::ColumnEncoding_Kind_DIRECT_V2:
+ return std::unique_ptr<ColumnReader>(new StringDirectColumnReader(type, stripe));
+ default:
+ throw NotImplementedYet("buildReader unhandled string encoding");
+ }
- case BOOLEAN:
- return std::unique_ptr<ColumnReader>(
- new BooleanColumnReader(type, stripe));
+ case BOOLEAN:
+ return std::unique_ptr<ColumnReader>(new BooleanColumnReader(type, stripe));
- case BYTE:
- return std::unique_ptr<ColumnReader>(
- new ByteColumnReader(type, stripe));
+ case BYTE:
+ return std::unique_ptr<ColumnReader>(new ByteColumnReader(type, stripe));
- case LIST:
- return std::unique_ptr<ColumnReader>(
- new ListColumnReader(type, stripe));
+ case LIST:
+ return std::unique_ptr<ColumnReader>(new ListColumnReader(type, stripe));
- case MAP:
- return std::unique_ptr<ColumnReader>(
- new MapColumnReader(type, stripe));
+ case MAP:
+ return std::unique_ptr<ColumnReader>(new MapColumnReader(type, stripe));
- case UNION:
- return std::unique_ptr<ColumnReader>(
- new UnionColumnReader(type, stripe));
+ case UNION:
+ return std::unique_ptr<ColumnReader>(new UnionColumnReader(type, stripe));
- case STRUCT:
- return std::unique_ptr<ColumnReader>(
- new StructColumnReader(type, stripe));
+ case STRUCT:
+ return std::unique_ptr<ColumnReader>(new StructColumnReader(type, stripe));
- case FLOAT:
- if (isLittleEndian()) {
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<FLOAT, true>(type, stripe));
- }
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<FLOAT, false>(type, stripe));
+ case FLOAT:
+ if (isLittleEndian()) {
+ return std::unique_ptr<ColumnReader>(new DoubleColumnReader<FLOAT, true>(type, stripe));
+ }
+ return std::unique_ptr<ColumnReader>(new DoubleColumnReader<FLOAT, false>(type, stripe));
- case DOUBLE:
- if (isLittleEndian()) {
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<DOUBLE, true>(type, stripe));
- }
- return std::unique_ptr<ColumnReader>(
- new DoubleColumnReader<DOUBLE, false>(type, stripe));
-
- case TIMESTAMP:
- return std::unique_ptr<ColumnReader>
- (new TimestampColumnReader(type, stripe, false));
-
- case TIMESTAMP_INSTANT:
- return std::unique_ptr<ColumnReader>
- (new TimestampColumnReader(type, stripe, true));
-
- case DECIMAL:
- // is this a Hive 0.11 or 0.12 file?
- if (type.getPrecision() == 0) {
- return std::unique_ptr<ColumnReader>
- (new DecimalHive11ColumnReader(type, stripe));
- }
- // can we represent the values using int64_t?
- if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) {
- if (stripe.isDecimalAsLong()) {
- return std::unique_ptr<ColumnReader>
- (new Decimal64ColumnReaderV2(type, stripe));
+ case DOUBLE:
+ if (isLittleEndian()) {
+ return std::unique_ptr<ColumnReader>(new DoubleColumnReader<DOUBLE, true>(type, stripe));
}
- return std::unique_ptr<ColumnReader>
- (new Decimal64ColumnReader(type, stripe));
- }
- // otherwise we use the Int128 implementation
- return std::unique_ptr<ColumnReader>
- (new Decimal128ColumnReader(type, stripe));
+ return std::unique_ptr<ColumnReader>(new DoubleColumnReader<DOUBLE, false>(type, stripe));
+
+ case TIMESTAMP:
+ return std::unique_ptr<ColumnReader>(new TimestampColumnReader(type, stripe, false));
+
+ case TIMESTAMP_INSTANT:
+ return std::unique_ptr<ColumnReader>(new TimestampColumnReader(type, stripe, true));
- default:
- throw NotImplementedYet("buildReader unhandled type");
+ case DECIMAL:
+ // is this a Hive 0.11 or 0.12 file?
+ if (type.getPrecision() == 0) {
+ return std::unique_ptr<ColumnReader>(new DecimalHive11ColumnReader(type, stripe));
+ }
+ // can we represent the values using int64_t?
+ if (type.getPrecision() <= Decimal64ColumnReader::MAX_PRECISION_64) {
+ if (stripe.isDecimalAsLong()) {
+ return std::unique_ptr<ColumnReader>(new Decimal64ColumnReaderV2(type, stripe));
+ }
+ return std::unique_ptr<ColumnReader>(new Decimal64ColumnReader(type, stripe));
+ }
+ // otherwise we use the Int128 implementation
+ return std::unique_ptr<ColumnReader>(new Decimal128ColumnReader(type, stripe));
+
+ default:
+ throw NotImplementedYet("buildReader unhandled type");
}
}
-}
+} // namespace orc
diff --git a/c++/src/ColumnReader.hh b/c++/src/ColumnReader.hh
index 67bd3ddeb..10b0bb6dc 100644
--- a/c++/src/ColumnReader.hh
+++ b/c++/src/ColumnReader.hh
@@ -31,7 +31,7 @@
namespace orc {
class StripeStreams {
- public:
+ public:
virtual ~StripeStreams();
/**
@@ -53,10 +53,9 @@ namespace orc {
* @param shouldStream should the reading page the stream in
* @return the new stream
*/
- virtual std::unique_ptr<SeekableInputStream>
- getStream(uint64_t columnId,
- proto::Stream_Kind kind,
- bool shouldStream) const = 0;
+ virtual std::unique_ptr<SeekableInputStream> getStream(uint64_t columnId,
+ proto::Stream_Kind kind,
+ bool shouldStream) const = 0;
/**
* Get the memory pool for this reader.
@@ -108,13 +107,13 @@ namespace orc {
* The interface for reading ORC data types.
*/
class ColumnReader {
- protected:
+ protected:
std::unique_ptr<ByteRleDecoder> notNullDecoder;
uint64_t columnId;
MemoryPool& memoryPool;
ReaderMetrics* metrics;
- public:
+ public:
ColumnReader(const Type& type, StripeStreams& stipe);
virtual ~ColumnReader();
@@ -134,9 +133,7 @@ namespace orc {
* a mask (with at least numValues bytes) for which values to
* set.
*/
- virtual void next(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull);
+ virtual void next(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull);
/**
* Read the next group of values without decoding
@@ -146,10 +143,7 @@ namespace orc {
* a mask (with at least numValues bytes) for which values to
* set.
*/
- virtual void nextEncoded(ColumnVectorBatch& rowBatch,
- uint64_t numValues,
- char* notNull)
- {
+ virtual void nextEncoded(ColumnVectorBatch& rowBatch, uint64_t numValues, char* notNull) {
rowBatch.isEncoded = false;
next(rowBatch, numValues, notNull);
}
@@ -158,16 +152,13 @@ namespace orc {
* Seek to beginning of a row group in the current stripe
* @param positions a list of PositionProviders storing the positions
*/
- virtual void seekToRowGroup(
- std::unordered_map<uint64_t, PositionProvider>& positions);
-
+ virtual void seekToRowGroup(std::unordered_map<uint64_t, PositionProvider>& positions);
};
/**
* Create a reader for the given stripe.
*/
- std::unique_ptr<ColumnReader> buildReader(const Type& type,
- StripeStreams& stripe);
-}
+ std::unique_ptr<ColumnReader> buildReader(const Type& type, StripeStreams& stripe);
+} // namespace orc
#endif
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index c8375d317..a70d0bf79 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -27,56 +27,43 @@
namespace orc {
StreamsFactory::~StreamsFactory() {
- //PASS
+ // PASS
}
class StreamsFactoryImpl : public StreamsFactory {
- public:
- StreamsFactoryImpl(
- const WriterOptions& writerOptions,
- OutputStream* outputStream) :
- options(writerOptions),
- outStream(outputStream) {
- }
-
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const override;
- private:
+ public:
+ StreamsFactoryImpl(const WriterOptions& writerOptions, OutputStream* outputStream)
+ : options(writerOptions), outStream(outputStream) {}
+
+ virtual std::unique_ptr<BufferedOutputStream> createStream(
+ proto::Stream_Kind kind) const override;
+
+ private:
const WriterOptions& options;
OutputStream* outStream;
};
- std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(
- proto::Stream_Kind) const {
+ std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream(proto::Stream_Kind) const {
// In the future, we can decide compression strategy and modifier
// based on stream kind. But for now we just use the setting from
// WriterOption
- return createCompressor(
- options.getCompression(),
- outStream,
- options.getCompressionStrategy(),
+ return createCompressor(options.getCompression(), outStream, options.getCompressionStrategy(),
// BufferedOutputStream initial capacity
- 1 * 1024 * 1024,
- options.getCompressionBlockSize(),
- *options.getMemoryPool(),
- options.getWriterMetrics());
+ 1 * 1024 * 1024, options.getCompressionBlockSize(),
+ *options.getMemoryPool(), options.getWriterMetrics());
}
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream* outStream) {
- return std::unique_ptr<StreamsFactory>(
- new StreamsFactoryImpl(options, outStream));
+ std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
+ OutputStream* outStream) {
+ return std::unique_ptr<StreamsFactory>(new StreamsFactoryImpl(options, outStream));
}
RowIndexPositionRecorder::~RowIndexPositionRecorder() {
// PASS
}
- proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion)
- {
- switch (rleVersion)
- {
+ proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) {
+ switch (rleVersion) {
case RleVersion_1:
return proto::ColumnEncoding_Kind_DIRECT;
case RleVersion_2:
@@ -86,24 +73,21 @@ namespace orc {
}
}
- ColumnWriter::ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- columnId(type.getColumnId()),
- colIndexStatistics(),
- colStripeStatistics(),
- colFileStatistics(),
- enableIndex(options.getEnableIndex()),
- rowIndex(),
- rowIndexEntry(),
- rowIndexPosition(),
- enableBloomFilter(false),
- memPool(*options.getMemoryPool()),
- indexStream(),
- bloomFilterStream(),
- hasNullValue(false) {
-
+ ColumnWriter::ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : columnId(type.getColumnId()),
+ colIndexStatistics(),
+ colStripeStatistics(),
+ colFileStatistics(),
+ enableIndex(options.getEnableIndex()),
+ rowIndex(),
+ rowIndexEntry(),
+ rowIndexPosition(),
+ enableBloomFilter(false),
+ memPool(*options.getMemoryPool()),
+ indexStream(),
+ bloomFilterStream(),
+ hasNullValue(false) {
std::unique_ptr<BufferedOutputStream> presentStream =
factory.createStream(proto::Stream_Kind_PRESENT);
notNullEncoder = createBooleanRleEncoder(std::move(presentStream));
@@ -114,19 +98,17 @@ namespace orc {
if (enableIndex) {
rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex());
- rowIndexEntry =
- std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry());
- rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>(
- new RowIndexPositionRecorder(*rowIndexEntry));
- indexStream =
- factory.createStream(proto::Stream_Kind_ROW_INDEX);
+ rowIndexEntry = std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry());
+ rowIndexPosition =
+ std::unique_ptr<RowIndexPositionRecorder>(new RowIndexPositionRecorder(*rowIndexEntry));
+ indexStream = factory.createStream(proto::Stream_Kind_ROW_INDEX);
// BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported
- if (options.isColumnUseBloomFilter(columnId)
- && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
+ if (options.isColumnUseBloomFilter(columnId) &&
+ options.getBloomFilterVersion() == BloomFilterVersion::UTF8) {
enableBloomFilter = true;
- bloomFilter.reset(new BloomFilterImpl(
- options.getRowIndexStride(), options.getBloomFilterFPP()));
+ bloomFilter.reset(
+ new BloomFilterImpl(options.getRowIndexStride(), options.getBloomFilterFPP()));
bloomFilterIndex.reset(new proto::BloomFilterIndex());
bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8);
}
@@ -137,9 +119,7 @@ namespace orc {
// PASS
}
- void ColumnWriter::add(ColumnVectorBatch& batch,
- uint64_t offset,
- uint64_t numValues,
+ void ColumnWriter::add(ColumnVectorBatch& batch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
const char* notNull = batch.notNull.data() + offset;
notNullEncoder->add(notNull, numValues, incomingMask);
@@ -168,8 +148,7 @@ namespace orc {
return notNullEncoder->getBufferSize();
}
- void ColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
getProtoBufStatistics(stats, colStripeStatistics.get());
}
@@ -183,13 +162,12 @@ namespace orc {
colIndexStatistics->reset();
}
- void ColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
getProtoBufStatistics(stats, colFileStatistics.get());
}
void ColumnWriter::createRowIndexEntry() {
- proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics();
+ proto::ColumnStatistics* indexStats = rowIndexEntry->mutable_statistics();
colIndexStatistics->toProtoBuf(*indexStats);
*rowIndex->add_entry() = *rowIndexEntry;
@@ -212,7 +190,7 @@ namespace orc {
}
}
- void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void ColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
if (!hasNullValue) {
// remove positions of present stream
int presentCount = indexStream->isCompressed() ? 4 : 3;
@@ -276,28 +254,21 @@ namespace orc {
}
class StructColumnWriter : public ColumnWriter {
- public:
- StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
+ public:
+ StructColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -305,23 +276,20 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void writeDictionary() override;
virtual void reset() override;
- private:
+ private:
std::vector<std::unique_ptr<ColumnWriter>> children;
};
- StructColumnWriter::StructColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
- for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+ StructColumnWriter::StructColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
+ for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
const Type& child = *type.getSubtype(i);
children.push_back(buildWriter(child, factory, options));
}
@@ -331,20 +299,15 @@ namespace orc {
}
}
- void StructColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void StructColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const StructVectorBatch* structBatch =
- dynamic_cast<const StructVectorBatch *>(&rowBatch);
+ const StructVectorBatch* structBatch = dynamic_cast<const StructVectorBatch*>(&rowBatch);
if (structBatch == nullptr) {
throw InvalidArgument("Failed to cast to StructVectorBatch");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = structBatch->hasNulls ?
- structBatch->notNull.data() + offset : nullptr;
+ const char* notNull = structBatch->hasNulls ? structBatch->notNull.data() + offset : nullptr;
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->add(*structBatch->fields[i], offset, numValues, notNull);
}
@@ -373,8 +336,7 @@ namespace orc {
}
}
- void StructColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
+ void StructColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->writeIndex(streams);
@@ -389,8 +351,7 @@ namespace orc {
return size;
}
- void StructColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void StructColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -400,8 +361,7 @@ namespace orc {
}
}
- void StructColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void StructColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -417,8 +377,7 @@ namespace orc {
}
}
- void StructColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void StructColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -426,7 +385,7 @@ namespace orc {
}
}
- void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
for (uint32_t i = 0; i < children.size(); ++i) {
@@ -457,46 +416,34 @@ namespace orc {
}
class IntegerColumnWriter : public ColumnWriter {
- public:
- IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
+ public:
+ IntegerColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
std::unique_ptr<RleEncoder> rleEncoder;
- private:
+ private:
RleVersion rleVersion;
};
- IntegerColumnWriter::IntegerColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()) {
+ IntegerColumnWriter::IntegerColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
- rleEncoder = createRleEncoder(
- std::move(dataStream),
- true,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_DATA);
+ rleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -504,13 +451,9 @@ namespace orc {
}
}
- void IntegerColumnWriter::add(
- ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void IntegerColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
+ const LongVectorBatch* longBatch = dynamic_cast<const LongVectorBatch*>(&rowBatch);
if (longBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
@@ -523,8 +466,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
+ const char* notNull = longBatch->hasNulls ? longBatch->notNull.data() + offset : nullptr;
rleEncoder->add(data, numValues, notNull);
@@ -561,8 +503,7 @@ namespace orc {
return size;
}
- void IntegerColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void IntegerColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
encoding.set_dictionarysize(0);
@@ -578,36 +519,29 @@ namespace orc {
}
class ByteColumnWriter : public ColumnWriter {
- public:
- ByteColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ ByteColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> byteRleEncoder;
};
- ByteColumnWriter::ByteColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
+ ByteColumnWriter::ByteColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
byteRleEncoder = createByteRleEncoder(std::move(dataStream));
if (enableIndex) {
@@ -615,9 +549,7 @@ namespace orc {
}
}
- void ByteColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
if (byteBatch == nullptr) {
@@ -632,8 +564,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
+ const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + offset : nullptr;
char* byteData = reinterpret_cast<char*>(data);
for (uint64_t i = 0; i < numValues; ++i) {
@@ -673,8 +604,7 @@ namespace orc {
return size;
}
- void ByteColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void ByteColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -690,36 +620,30 @@ namespace orc {
}
class BooleanColumnWriter : public ColumnWriter {
- public:
- BooleanColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ BooleanColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> rleEncoder;
};
- BooleanColumnWriter::BooleanColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
+ BooleanColumnWriter::BooleanColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
rleEncoder = createBooleanRleEncoder(std::move(dataStream));
if (enableIndex) {
@@ -727,9 +651,7 @@ namespace orc {
}
}
- void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch);
if (byteBatch == nullptr) {
@@ -744,8 +666,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* data = byteBatch->data.data() + offset;
- const char* notNull = byteBatch->hasNulls ?
- byteBatch->notNull.data() + offset : nullptr;
+ const char* notNull = byteBatch->hasNulls ? byteBatch->notNull.data() + offset : nullptr;
char* byteData = reinterpret_cast<char*>(data);
for (uint64_t i = 0; i < numValues; ++i) {
@@ -785,8 +706,7 @@ namespace orc {
return size;
}
- void BooleanColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void BooleanColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -802,42 +722,33 @@ namespace orc {
}
class DoubleColumnWriter : public ColumnWriter {
- public:
- DoubleColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloat);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ DoubleColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isFloat);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- private:
+ private:
bool isFloat;
std::unique_ptr<AppendOnlyBufferedStream> dataStream;
DataBuffer<char> buffer;
};
- DoubleColumnWriter::DoubleColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isFloatType) :
- ColumnWriter(type, factory, options),
- isFloat(isFloatType),
- buffer(*options.getMemoryPool()) {
- dataStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
+ DoubleColumnWriter::DoubleColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isFloatType)
+ : ColumnWriter(type, factory, options),
+ isFloat(isFloatType),
+ buffer(*options.getMemoryPool()) {
+ dataStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
buffer.resize(isFloat ? 4 : 8);
if (enableIndex) {
@@ -855,17 +766,14 @@ namespace orc {
}
}
- void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const DoubleVectorBatch* dblBatch =
- dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
+ const DoubleVectorBatch* dblBatch = dynamic_cast<const DoubleVectorBatch*>(&rowBatch);
if (dblBatch == nullptr) {
throw InvalidArgument("Failed to cast to DoubleVectorBatch");
}
DoubleColumnStatisticsImpl* doubleStats =
- dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get());
if (doubleStats == nullptr) {
throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl");
}
@@ -873,8 +781,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const double* doubleData = dblBatch->data.data() + offset;
- const char* notNull = dblBatch->hasNulls ?
- dblBatch->notNull.data() + offset : nullptr;
+ const char* notNull = dblBatch->hasNulls ? dblBatch->notNull.data() + offset : nullptr;
size_t bytes = isFloat ? 4 : 8;
char* data = buffer.data();
@@ -916,8 +823,7 @@ namespace orc {
return size;
}
- void DoubleColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void DoubleColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -936,27 +842,26 @@ namespace orc {
* Implementation of increasing sorted string dictionary
*/
class SortedStringDictionary {
- public:
+ public:
struct DictEntry {
- DictEntry(const char * str, size_t len):data(str),length(len) {}
- const char * data;
+ DictEntry(const char* str, size_t len) : data(str), length(len) {}
+ const char* data;
size_t length;
};
- SortedStringDictionary():totalLength(0) {}
+ SortedStringDictionary() : totalLength(0) {}
// insert a new string into dictionary, return its insertion order
- size_t insert(const char * data, size_t len);
+ size_t insert(const char* data, size_t len);
// write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const;
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
// reorder input index buffer from insertion order to dictionary order
void reorder(std::vector<int64_t>& idxBuffer) const;
// get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const;
+ void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
// return count of entries
size_t size() const;
@@ -966,7 +871,7 @@ namespace orc {
void clear();
- private:
+ private:
struct LessThan {
bool operator()(const DictEntry& left, const DictEntry& right) const {
int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
@@ -990,14 +895,14 @@ namespace orc {
};
// insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char * str, size_t len) {
+ size_t SortedStringDictionary::insert(const char* str, size_t len) {
auto ret = dict.insert({DictEntry(str, len), dict.size()});
if (ret.second) {
// make a copy to internal storage
data.push_back(std::vector<char>(len));
memcpy(data.back().data(), str, len);
// update dictionary entry to link pointer to internal storage
- DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first));
+ DictEntry* entry = const_cast<DictEntry*>(&(ret.first->first));
entry->data = data.back().data();
totalLength += len;
}
@@ -1005,8 +910,8 @@ namespace orc {
}
// write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream,
- RleEncoder * lengthEncoder) const {
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+ RleEncoder* lengthEncoder) const {
for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
dataStream->write(it->first.data, it->first.length);
lengthEncoder->write(static_cast<int64_t>(it->first.length));
@@ -1033,14 +938,13 @@ namespace orc {
// do the transformation
for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(
- mapping[static_cast<size_t>(idxBuffer[i])]);
+ idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
}
}
// get dict entries in insertion order
void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry *>& entries) const {
+ std::vector<const DictEntry*>& entries) const {
entries.resize(dict.size());
for (auto it = dict.cbegin(); it != dict.cend(); ++it) {
entries[it->second] = &(it->first);
@@ -1057,29 +961,25 @@ namespace orc {
return totalLength;
}
- void SortedStringDictionary::clear() {
+ void SortedStringDictionary::clear() {
totalLength = 0;
data.clear();
dict.clear();
}
class StringColumnWriter : public ColumnWriter {
- public:
- StringColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ StringColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
@@ -1089,7 +989,7 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
/**
* dictionary related functions
*/
@@ -1099,7 +999,7 @@ namespace orc {
void deleteDictStreams();
void fallbackToDirectEncoding();
- protected:
+ protected:
RleVersion rleVersion;
bool useCompression;
const StreamsFactory& streamsFactory;
@@ -1129,18 +1029,16 @@ namespace orc {
mutable std::vector<size_t> startOfRowGroups;
};
- StringColumnWriter::StringColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- useCompression(options.getCompression() != CompressionKind_NONE),
- streamsFactory(factory),
- alignedBitPacking(options.getAlignedBitpacking()),
- doneDictionaryCheck(false),
- useDictionary(options.getEnableDictionary()),
- dictSizeThreshold(options.getDictionaryKeySizeThreshold()){
+ StringColumnWriter::StringColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ useCompression(options.getCompression() != CompressionKind_NONE),
+ streamsFactory(factory),
+ alignedBitPacking(options.getAlignedBitpacking()),
+ doneDictionaryCheck(false),
+ useDictionary(options.getEnableDictionary()),
+ dictSizeThreshold(options.getDictionaryKeySizeThreshold()) {
if (type.getKind() == TypeKind::BINARY) {
useDictionary = false;
doneDictionaryCheck = true;
@@ -1158,12 +1056,9 @@ namespace orc {
}
}
- void StringColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void StringColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const StringVectorBatch* stringBatch =
- dynamic_cast<const StringVectorBatch*>(&rowBatch);
+ const StringVectorBatch* stringBatch = dynamic_cast<const StringVectorBatch*>(&rowBatch);
if (stringBatch == nullptr) {
throw InvalidArgument("Failed to cast to StringVectorBatch");
}
@@ -1176,12 +1071,11 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- char *const * data = stringBatch->data.data() + offset;
+ char* const* data = stringBatch->data.data() + offset;
const int64_t* length = stringBatch->length.data() + offset;
- const char* notNull = stringBatch->hasNulls ?
- stringBatch->notNull.data() + offset : nullptr;
+ const char* notNull = stringBatch->hasNulls ? stringBatch->notNull.data() + offset : nullptr;
- if (!useDictionary){
+ if (!useDictionary) {
directLengthEncoder->add(length, numValues, notNull);
}
@@ -1260,17 +1154,14 @@ namespace orc {
return size;
}
- void StringColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void StringColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
if (!useDictionary) {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DIRECT :
- proto::ColumnEncoding_Kind_DIRECT_V2);
+ encoding.set_kind(rleVersion == RleVersion_1 ? proto::ColumnEncoding_Kind_DIRECT
+ : proto::ColumnEncoding_Kind_DIRECT_V2);
} else {
- encoding.set_kind(rleVersion == RleVersion_1 ?
- proto::ColumnEncoding_Kind_DICTIONARY :
- proto::ColumnEncoding_Kind_DICTIONARY_V2);
+ encoding.set_kind(rleVersion == RleVersion_1 ? proto::ColumnEncoding_Kind_DICTIONARY
+ : proto::ColumnEncoding_Kind_DICTIONARY_V2);
}
encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size()));
if (enableBloomFilter) {
@@ -1293,8 +1184,9 @@ namespace orc {
bool StringColumnWriter::checkDictionaryKeyRatio() {
if (!doneDictionaryCheck) {
- useDictionary = dictionary.size() <= static_cast<size_t>(
- static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold);
+ useDictionary = dictionary.size() <=
+ static_cast<size_t>(static_cast<double>(dictionary.idxInDictBuffer.size()) *
+ dictSizeThreshold);
doneDictionaryCheck = true;
}
@@ -1321,33 +1213,24 @@ namespace orc {
void StringColumnWriter::createDirectStreams() {
std::unique_ptr<BufferedOutputStream> directLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- directLengthEncoder = createRleEncoder(std::move(directLengthStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
- directDataStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DATA)));
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ directLengthEncoder = createRleEncoder(std::move(directLengthStream), false, rleVersion,
+ memPool, alignedBitPacking);
+ directDataStream.reset(
+ new AppendOnlyBufferedStream(streamsFactory.createStream(proto::Stream_Kind_DATA)));
}
void StringColumnWriter::createDictStreams() {
std::unique_ptr<BufferedOutputStream> dictDataStream =
- streamsFactory.createStream(proto::Stream_Kind_DATA);
- dictDataEncoder = createRleEncoder(std::move(dictDataStream),
- false,
- rleVersion,
- memPool,
- alignedBitPacking);
+ streamsFactory.createStream(proto::Stream_Kind_DATA);
+ dictDataEncoder =
+ createRleEncoder(std::move(dictDataStream), false, rleVersion, memPool, alignedBitPacking);
std::unique_ptr<BufferedOutputStream> dictLengthStream =
- streamsFactory.createStream(proto::Stream_Kind_LENGTH);
- dictLengthEncoder = createRleEncoder(std::move(dictLengthStream),
- false,
- rleVersion,
- memPool,
+ streamsFactory.createStream(proto::Stream_Kind_LENGTH);
+ dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), false, rleVersion, memPool,
alignedBitPacking);
dictStream.reset(new AppendOnlyBufferedStream(
- streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
+ streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA)));
}
void StringColumnWriter::deleteDictStreams() {
@@ -1361,7 +1244,7 @@ namespace orc {
}
void StringColumnWriter::writeDictionary() {
- if (useDictionary && !doneDictionaryCheck) {
+ if (useDictionary && !doneDictionaryCheck) {
// when index is disabled, dictionary check happens while writing 1st stripe
if (!checkDictionaryKeyRatio()) {
fallbackToDirectEncoding();
@@ -1377,7 +1260,7 @@ namespace orc {
dictionary.reorder(dictionary.idxInDictBuffer);
// write data sequences
- int64_t * data = dictionary.idxInDictBuffer.data();
+ int64_t* data = dictionary.idxInDictBuffer.data();
if (enableIndex) {
size_t prevOffset = 0;
for (size_t i = 0; i < startOfRowGroups.size(); ++i) {
@@ -1387,9 +1270,9 @@ namespace orc {
// update index positions
int rowGroupId = static_cast<int>(i);
- proto::RowIndexEntry* indexEntry =
- (rowGroupId < rowIndex->entry_size()) ?
- rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get();
+ proto::RowIndexEntry* indexEntry = (rowGroupId < rowIndex->entry_size())
+ ? rowIndex->mutable_entry(rowGroupId)
+ : rowIndexEntry.get();
// add positions for direct streams
RowIndexPositionRecorder recorder(*indexEntry);
@@ -1398,8 +1281,7 @@ namespace orc {
prevOffset = offset;
}
- dictDataEncoder->add(data + prevOffset,
- dictionary.idxInDictBuffer.size() - prevOffset,
+ dictDataEncoder->add(data + prevOffset, dictionary.idxInDictBuffer.size() - prevOffset,
nullptr);
} else {
dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr);
@@ -1413,18 +1295,18 @@ namespace orc {
if (enableIndex) {
// fallback happens at the 1st row group;
// simply complete positions for direct streams
- proto::RowIndexEntry * indexEntry = rowIndexEntry.get();
+ proto::RowIndexEntry* indexEntry = rowIndexEntry.get();
RowIndexPositionRecorder recorder(*indexEntry);
directDataStream->recordPosition(&recorder);
directLengthEncoder->recordPosition(&recorder);
}
// get dictionary entries in insertion order
- std::vector<const SortedStringDictionary::DictEntry *> entries;
+ std::vector<const SortedStringDictionary::DictEntry*> entries;
dictionary.getEntriesInInsertionOrder(entries);
// store each length of the data into a vector
- const SortedStringDictionary::DictEntry * dictEntry = nullptr;
+ const SortedStringDictionary::DictEntry* dictEntry = nullptr;
for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) {
// write one row data in direct encoding
dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])];
@@ -1439,7 +1321,7 @@ namespace orc {
/**
* Counts how many utf-8 chars of the input data
*/
- static uint64_t charLength(const char * data, uint64_t length) {
+ static uint64_t charLength(const char* data, uint64_t length) {
uint64_t chars = 0;
for (uint64_t i = 0; i < length; i++) {
if (isUtfStartByte(data[i])) {
@@ -1459,9 +1341,7 @@ namespace orc {
* @param data the bytes of UTF-8
* @param length the length of data to truncate
*/
- static uint64_t truncateBytesTo(uint64_t maxCharLength,
- const char * data,
- uint64_t length) {
+ static uint64_t truncateBytesTo(uint64_t maxCharLength, const char* data, uint64_t length) {
uint64_t chars = 0;
if (length <= maxCharLength) {
return length;
@@ -1491,8 +1371,8 @@ namespace orc {
* @param from the first byte location
* @param until the last byte location
* @return the index of the last character
- */
- static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) {
+ */
+ static uint64_t findLastCharacter(const char* text, uint64_t from, uint64_t until) {
uint64_t posn = until;
/* we don't expect characters more than 5 bytes */
while (posn >= from) {
@@ -1502,36 +1382,29 @@ namespace orc {
posn -= 1;
}
/* beginning of a valid char not found */
- throw std::logic_error(
- "Could not truncate string, beginning of a valid char not found");
+ throw std::logic_error("Could not truncate string, beginning of a valid char not found");
}
};
class CharColumnWriter : public StringColumnWriter {
- public:
- CharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()),
- padBuffer(*options.getMemoryPool()) {
+ public:
+ CharColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options)
+ : StringColumnWriter(type, factory, options),
+ maxLength(type.getMaximumLength()),
+ padBuffer(*options.getMemoryPool()) {
// utf-8 is currently 4 bytes long, but it could be up to 6
padBuffer.resize(maxLength * 6);
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
uint64_t maxLength;
DataBuffer<char> padBuffer;
};
- void CharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void CharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
@@ -1548,26 +1421,24 @@ namespace orc {
char** data = charsBatch->data.data() + offset;
int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
+ const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- const char * charData = nullptr;
+ const char* charData = nullptr;
uint64_t originLength = static_cast<uint64_t>(length[i]);
uint64_t charLength = Utf8Utils::charLength(data[i], originLength);
if (charLength >= maxLength) {
charData = data[i];
- length[i] = static_cast<int64_t>(
- Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
+ length[i] =
+ static_cast<int64_t>(Utf8Utils::truncateBytesTo(maxLength, data[i], originLength));
} else {
charData = padBuffer.data();
// the padding is exactly 1 byte per char
length[i] = length[i] + static_cast<int64_t>(maxLength - charLength);
memcpy(padBuffer.data(), data[i], originLength);
- memset(padBuffer.data() + originLength,
- ' ',
+ memset(padBuffer.data() + originLength, ' ',
static_cast<size_t>(length[i]) - originLength);
}
@@ -1597,27 +1468,21 @@ namespace orc {
}
class VarCharColumnWriter : public StringColumnWriter {
- public:
- VarCharColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options),
- maxLength(type.getMaximumLength()) {
+ public:
+ VarCharColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : StringColumnWriter(type, factory, options), maxLength(type.getMaximumLength()) {
// PASS
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
uint64_t maxLength;
};
- void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (charsBatch == nullptr) {
@@ -1634,14 +1499,13 @@ namespace orc {
char* const* data = charsBatch->data.data() + offset;
int64_t* length = charsBatch->length.data() + offset;
- const char* notNull = charsBatch->hasNulls ?
- charsBatch->notNull.data() + offset : nullptr;
+ const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
- uint64_t itemLength = Utf8Utils::truncateBytesTo(
- maxLength, data[i], static_cast<uint64_t>(length[i]));
+ uint64_t itemLength =
+ Utf8Utils::truncateBytesTo(maxLength, data[i], static_cast<uint64_t>(length[i]));
length[i] = static_cast<int64_t>(itemLength);
if (useDictionary) {
@@ -1670,23 +1534,18 @@ namespace orc {
}
class BinaryColumnWriter : public StringColumnWriter {
- public:
- BinaryColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- StringColumnWriter(type, factory, options) {
+ public:
+ BinaryColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : StringColumnWriter(type, factory, options) {
// PASS
}
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
};
- void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch);
if (binBatch == nullptr) {
@@ -1703,8 +1562,7 @@ namespace orc {
char** data = binBatch->data.data() + offset;
int64_t* length = binBatch->length.data() + offset;
- const char* notNull = binBatch->hasNulls ?
- binBatch->notNull.data() + offset : nullptr;
+ const char* notNull = binBatch->hasNulls ? binBatch->notNull.data() + offset : nullptr;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -1727,60 +1585,43 @@ namespace orc {
}
class TimestampColumnWriter : public ColumnWriter {
- public:
- TimestampColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isInstantType);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ TimestampColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isInstantType);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder;
- private:
+ private:
RleVersion rleVersion;
const Timezone& timezone;
const bool isUTC;
};
- TimestampColumnWriter::TimestampColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options,
- bool isInstantType) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- timezone(isInstantType ?
- getTimezoneByName("GMT") :
- options.getTimezone()),
- isUTC(isInstantType ||
- options.getTimezoneName() == "GMT") {
+ TimestampColumnWriter::TimestampColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options, bool isInstantType)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ timezone(isInstantType ? getTimezoneByName("GMT") : options.getTimezone()),
+ isUTC(isInstantType || options.getTimezoneName() == "GMT") {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
std::unique_ptr<BufferedOutputStream> secondaryStream =
factory.createStream(proto::Stream_Kind_SECONDARY);
- secRleEncoder = createRleEncoder(std::move(dataStream),
- true,
- rleVersion,
- memPool,
+ secRleEncoder = createRleEncoder(std::move(dataStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
- nanoRleEncoder = createRleEncoder(std::move(secondaryStream),
- false,
- rleVersion,
- memPool,
+ nanoRleEncoder = createRleEncoder(std::move(secondaryStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -1809,12 +1650,9 @@ namespace orc {
}
}
- void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- TimestampVectorBatch* tsBatch =
- dynamic_cast<TimestampVectorBatch*>(&rowBatch);
+ TimestampVectorBatch* tsBatch = dynamic_cast<TimestampVectorBatch*>(&rowBatch);
if (tsBatch == nullptr) {
throw InvalidArgument("Failed to cast to TimestampVectorBatch");
}
@@ -1827,10 +1665,9 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = tsBatch->hasNulls ?
- tsBatch->notNull.data() + offset : nullptr;
- int64_t *secs = tsBatch->data.data() + offset;
- int64_t *nanos = tsBatch->nanoseconds.data() + offset;
+ const char* notNull = tsBatch->hasNulls ? tsBatch->notNull.data() + offset : nullptr;
+ int64_t* secs = tsBatch->data.data() + offset;
+ int64_t* nanos = tsBatch->nanoseconds.data() + offset;
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
@@ -1887,7 +1724,7 @@ namespace orc {
}
void TimestampColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
encoding.set_dictionarysize(0);
@@ -1904,31 +1741,22 @@ namespace orc {
}
class DateColumnWriter : public IntegerColumnWriter {
- public:
- DateColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
-
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ public:
+ DateColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
+
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
};
- DateColumnWriter::DateColumnWriter(
- const Type &type,
- const StreamsFactory &factory,
- const WriterOptions &options) :
- IntegerColumnWriter(type, factory, options) {
+ DateColumnWriter::DateColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : IntegerColumnWriter(type, factory, options) {
// PASS
}
- void DateColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void DateColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const LongVectorBatch* longBatch =
- dynamic_cast<const LongVectorBatch*>(&rowBatch);
+ const LongVectorBatch* longBatch = dynamic_cast<const LongVectorBatch*>(&rowBatch);
if (longBatch == nullptr) {
throw InvalidArgument("Failed to cast to LongVectorBatch");
}
@@ -1942,8 +1770,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const int64_t* data = longBatch->data.data() + offset;
- const char* notNull = longBatch->hasNulls ?
- longBatch->notNull.data() + offset : nullptr;
+ const char* notNull = longBatch->hasNulls ? longBatch->notNull.data() + offset : nullptr;
rleEncoder->add(data, numValues, notNull);
@@ -1964,55 +1791,45 @@ namespace orc {
}
class Decimal64ColumnWriter : public ColumnWriter {
- public:
+ public:
static const uint32_t MAX_PRECISION_64 = 18;
static const uint32_t MAX_PRECISION_128 = 38;
- Decimal64ColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
RleVersion rleVersion;
uint64_t precision;
uint64_t scale;
std::unique_ptr<AppendOnlyBufferedStream> valueStream;
std::unique_ptr<RleEncoder> scaleEncoder;
- private:
+ private:
char buffer[10];
};
- Decimal64ColumnWriter::Decimal64ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()),
- precision(type.getPrecision()),
- scale(type.getScale()) {
- valueStream.reset(new AppendOnlyBufferedStream(
- factory.createStream(proto::Stream_Kind_DATA)));
+ Decimal64ColumnWriter::Decimal64ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ rleVersion(options.getRleVersion()),
+ precision(type.getPrecision()),
+ scale(type.getScale()) {
+ valueStream.reset(new AppendOnlyBufferedStream(factory.createStream(proto::Stream_Kind_DATA)));
std::unique_ptr<BufferedOutputStream> scaleStream =
factory.createStream(proto::Stream_Kind_SECONDARY);
- scaleEncoder = createRleEncoder(std::move(scaleStream),
- true,
- rleVersion,
- memPool,
+ scaleEncoder = createRleEncoder(std::move(scaleStream), true, rleVersion, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -2020,26 +1837,22 @@ namespace orc {
}
}
- void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const Decimal64VectorBatch* decBatch =
- dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
+ const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
const int64_t* values = decBatch->values.data() + offset;
uint64_t count = 0;
@@ -2060,10 +1873,8 @@ namespace orc {
valueStream->write(buffer, static_cast<size_t>(data - buffer));
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
}
@@ -2100,7 +1911,7 @@ namespace orc {
}
void Decimal64ColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
encoding.set_dictionarysize(0);
@@ -2117,44 +1928,35 @@ namespace orc {
}
class Decimal64ColumnWriterV2 : public ColumnWriter {
- public:
- Decimal64ColumnWriterV2(const Type& type,
- const StreamsFactory& factory,
+ public:
+ Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
virtual void recordPosition() const override;
- protected:
+ protected:
uint64_t precision;
uint64_t scale;
std::unique_ptr<RleEncoder> valueEncoder;
};
- Decimal64ColumnWriterV2::Decimal64ColumnWriterV2(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- precision(type.getPrecision()),
- scale(type.getScale()) {
+ Decimal64ColumnWriterV2::Decimal64ColumnWriterV2(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options),
+ precision(type.getPrecision()),
+ scale(type.getScale()) {
std::unique_ptr<BufferedOutputStream> dataStream =
factory.createStream(proto::Stream_Kind_DATA);
- valueEncoder = createRleEncoder(std::move(dataStream),
- true,
- RleVersion_2,
- memPool,
+ valueEncoder = createRleEncoder(std::move(dataStream), true, RleVersion_2, memPool,
options.getAlignedBitpacking());
if (enableIndex) {
@@ -2162,18 +1964,15 @@ namespace orc {
}
}
- void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char* incomingMask) {
- const Decimal64VectorBatch* decBatch =
- dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
+ void Decimal64ColumnWriterV2::add(ColumnVectorBatch& rowBatch, uint64_t offset,
+ uint64_t numValues, const char* incomingMask) {
+ const Decimal64VectorBatch* decBatch = dynamic_cast<const Decimal64VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal64VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
@@ -2181,8 +1980,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
const int64_t* data = decBatch->values.data() + offset;
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
valueEncoder->add(data, numValues, notNull);
@@ -2191,10 +1989,8 @@ namespace orc {
if (!notNull || notNull[i]) {
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- data[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(data[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(data[i], static_cast<int32_t>(scale)));
}
@@ -2222,7 +2018,7 @@ namespace orc {
}
void Decimal64ColumnWriterV2::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(RleVersion_2));
encoding.set_dictionarysize(0);
@@ -2238,25 +2034,20 @@ namespace orc {
}
class Decimal128ColumnWriter : public Decimal64ColumnWriter {
- public:
- Decimal128ColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
- private:
+ private:
char buffer[20];
};
- Decimal128ColumnWriter::Decimal128ColumnWriter(
- const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- Decimal64ColumnWriter(type, factory, options) {
+ Decimal128ColumnWriter::Decimal128ColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : Decimal64ColumnWriter(type, factory, options) {
// PASS
}
@@ -2273,26 +2064,22 @@ namespace orc {
return val;
}
- void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
- const Decimal128VectorBatch* decBatch =
- dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
+ const Decimal128VectorBatch* decBatch = dynamic_cast<const Decimal128VectorBatch*>(&rowBatch);
if (decBatch == nullptr) {
throw InvalidArgument("Failed to cast to Decimal128VectorBatch");
}
DecimalColumnStatisticsImpl* decStats =
- dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
+ dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get());
if (decStats == nullptr) {
throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl");
}
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = decBatch->hasNulls ?
- decBatch->notNull.data() + offset : nullptr;
+ const char* notNull = decBatch->hasNulls ? decBatch->notNull.data() + offset : nullptr;
const Int128* values = decBatch->values.data() + offset;
// The current encoding of decimal columns stores the integer representation
@@ -2315,10 +2102,8 @@ namespace orc {
++count;
if (enableBloomFilter) {
- std::string decimal = Decimal(
- values[i], static_cast<int32_t>(scale)).toString(true);
- bloomFilter->addBytes(
- decimal.c_str(), static_cast<int64_t>(decimal.size()));
+ std::string decimal = Decimal(values[i], static_cast<int32_t>(scale)).toString(true);
+ bloomFilter->addBytes(decimal.c_str(), static_cast<int64_t>(decimal.size()));
}
decStats->update(Decimal(values[i], static_cast<int32_t>(scale)));
}
@@ -2332,29 +2117,22 @@ namespace orc {
}
class ListColumnWriter : public ColumnWriter {
- public:
- ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ ListColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
~ListColumnWriter() override;
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2362,8 +2140,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2371,24 +2148,18 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<RleEncoder> lengthEncoder;
RleVersion rleVersion;
std::unique_ptr<ColumnWriter> child;
};
- ListColumnWriter::ListColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
-
+ ListColumnWriter::ListColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (type.getSubtypeCount() == 1) {
@@ -2404,9 +2175,7 @@ namespace orc {
// PASS
}
- void ListColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void ListColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch);
if (listBatch == nullptr) {
@@ -2421,8 +2190,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* offsets = listBatch->offsets.data() + offset;
- const char* notNull = listBatch->hasNulls ?
- listBatch->notNull.data() + offset : nullptr;
+ const char* notNull = listBatch->hasNulls ? listBatch->notNull.data() + offset : nullptr;
uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
@@ -2474,7 +2242,7 @@ namespace orc {
}
}
- void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void ListColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
if (child.get()) {
child->writeIndex(streams);
@@ -2490,8 +2258,7 @@ namespace orc {
return size;
}
- void ListColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void ListColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
encoding.set_dictionarysize(0);
@@ -2504,8 +2271,7 @@ namespace orc {
}
}
- void ListColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ListColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
if (child.get()) {
child->getStripeStatistics(stats);
@@ -2519,15 +2285,14 @@ namespace orc {
}
}
- void ListColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void ListColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
if (child.get()) {
child->getFileStatistics(stats);
}
}
- void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
if (child.get()) {
child->mergeRowGroupStatsIntoStripeStats();
@@ -2560,29 +2325,22 @@ namespace orc {
}
class MapColumnWriter : public ColumnWriter {
- public:
- MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ MapColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
~MapColumnWriter() override;
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2590,8 +2348,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2599,24 +2356,19 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<ColumnWriter> keyWriter;
std::unique_ptr<ColumnWriter> elemWriter;
std::unique_ptr<RleEncoder> lengthEncoder;
RleVersion rleVersion;
};
- MapColumnWriter::MapColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options),
- rleVersion(options.getRleVersion()){
+ MapColumnWriter::MapColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options), rleVersion(options.getRleVersion()) {
std::unique_ptr<BufferedOutputStream> lengthStream =
- factory.createStream(proto::Stream_Kind_LENGTH);
- lengthEncoder = createRleEncoder(std::move(lengthStream),
- false,
- rleVersion,
- memPool,
+ factory.createStream(proto::Stream_Kind_LENGTH);
+ lengthEncoder = createRleEncoder(std::move(lengthStream), false, rleVersion, memPool,
options.getAlignedBitpacking());
if (type.getSubtypeCount() > 0) {
@@ -2636,9 +2388,7 @@ namespace orc {
// PASS
}
- void MapColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void MapColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch);
if (mapBatch == nullptr) {
@@ -2653,8 +2403,7 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
int64_t* offsets = mapBatch->offsets.data() + offset;
- const char* notNull = mapBatch->hasNulls ?
- mapBatch->notNull.data() + offset : nullptr;
+ const char* notNull = mapBatch->hasNulls ? mapBatch->notNull.data() + offset : nullptr;
uint64_t elemOffset = static_cast<uint64_t>(offsets[0]);
uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]);
@@ -2713,8 +2462,7 @@ namespace orc {
}
}
- void MapColumnWriter::writeIndex(
- std::vector<proto::Stream> &streams) const {
+ void MapColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
if (keyWriter.get()) {
keyWriter->writeIndex(streams);
@@ -2736,8 +2484,7 @@ namespace orc {
return size;
}
- void MapColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void MapColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(RleVersionMapper(rleVersion));
encoding.set_dictionarysize(0);
@@ -2753,8 +2500,7 @@ namespace orc {
}
}
- void MapColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void MapColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
if (keyWriter.get()) {
keyWriter->getStripeStatistics(stats);
@@ -2774,8 +2520,7 @@ namespace orc {
}
}
- void MapColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void MapColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
if (keyWriter.get()) {
keyWriter->getFileStatistics(stats);
@@ -2785,7 +2530,7 @@ namespace orc {
}
}
- void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
if (keyWriter.get()) {
keyWriter->mergeRowGroupStatsIntoStripeStats();
@@ -2830,28 +2575,22 @@ namespace orc {
}
class UnionColumnWriter : public ColumnWriter {
- public:
- UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
+ public:
+ UnionColumnWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) override;
virtual void flush(std::vector<proto::Stream>& streams) override;
virtual uint64_t getEstimatedSize() const override;
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const override;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const override;
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const override;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const override;
virtual void mergeStripeStatsIntoFileStats() override;
@@ -2859,8 +2598,7 @@ namespace orc {
virtual void createRowIndexEntry() override;
- virtual void writeIndex(
- std::vector<proto::Stream> &streams) const override;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const override;
virtual void recordPosition() const override;
@@ -2868,24 +2606,20 @@ namespace orc {
virtual void reset() override;
- private:
+ private:
std::unique_ptr<ByteRleEncoder> rleEncoder;
std::vector<std::unique_ptr<ColumnWriter>> children;
};
- UnionColumnWriter::UnionColumnWriter(const Type& type,
- const StreamsFactory& factory,
- const WriterOptions& options) :
- ColumnWriter(type, factory, options) {
-
+ UnionColumnWriter::UnionColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : ColumnWriter(type, factory, options) {
std::unique_ptr<BufferedOutputStream> dataStream =
- factory.createStream(proto::Stream_Kind_DATA);
+ factory.createStream(proto::Stream_Kind_DATA);
rleEncoder = createByteRleEncoder(std::move(dataStream));
for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) {
- children.push_back(buildWriter(*type.getSubtype(i),
- factory,
- options));
+ children.push_back(buildWriter(*type.getSubtype(i), factory, options));
}
if (enableIndex) {
@@ -2893,9 +2627,7 @@ namespace orc {
}
}
- void UnionColumnWriter::add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
+ void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
const char* incomingMask) {
UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch);
if (unionBatch == nullptr) {
@@ -2904,10 +2636,9 @@ namespace orc {
ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
- const char* notNull = unionBatch->hasNulls ?
- unionBatch->notNull.data() + offset : nullptr;
- unsigned char * tags = unionBatch->tags.data() + offset;
- uint64_t * offsets = unionBatch->offsets.data() + offset;
+ const char* notNull = unionBatch->hasNulls ? unionBatch->notNull.data() + offset : nullptr;
+ unsigned char* tags = unionBatch->tags.data() + offset;
+ uint64_t* offsets = unionBatch->offsets.data() + offset;
std::vector<int64_t> childOffset(children.size(), -1);
std::vector<uint64_t> childLength(children.size(), 0);
@@ -2923,8 +2654,7 @@ namespace orc {
for (uint32_t i = 0; i < children.size(); ++i) {
if (childLength[i] > 0) {
- children[i]->add(*unionBatch->children[i],
- static_cast<uint64_t>(childOffset[i]),
+ children[i]->add(*unionBatch->children[i], static_cast<uint64_t>(childOffset[i]),
childLength[i], nullptr);
}
}
@@ -2965,7 +2695,7 @@ namespace orc {
}
}
- void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const {
+ void UnionColumnWriter::writeIndex(std::vector<proto::Stream>& streams) const {
ColumnWriter::writeIndex(streams);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->writeIndex(streams);
@@ -2981,8 +2711,7 @@ namespace orc {
return size;
}
- void UnionColumnWriter::getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const {
+ void UnionColumnWriter::getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const {
proto::ColumnEncoding encoding;
encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT);
encoding.set_dictionarysize(0);
@@ -2995,8 +2724,7 @@ namespace orc {
}
}
- void UnionColumnWriter::getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void UnionColumnWriter::getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getStripeStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->getStripeStatistics(stats);
@@ -3010,15 +2738,14 @@ namespace orc {
}
}
- void UnionColumnWriter::getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const {
+ void UnionColumnWriter::getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const {
ColumnWriter::getFileStatistics(stats);
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->getFileStatistics(stats);
}
}
- void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
+ void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() {
ColumnWriter::mergeRowGroupStatsIntoStripeStats();
for (uint32_t i = 0; i < children.size(); ++i) {
children[i]->mergeRowGroupStatsIntoStripeStats();
@@ -3050,140 +2777,63 @@ namespace orc {
}
}
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
+ std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options) {
switch (static_cast<int64_t>(type.getKind())) {
case STRUCT:
- return std::unique_ptr<ColumnWriter>(
- new StructColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new StructColumnWriter(type, factory, options));
case INT:
case LONG:
case SHORT:
- return std::unique_ptr<ColumnWriter>(
- new IntegerColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new IntegerColumnWriter(type, factory, options));
case BYTE:
- return std::unique_ptr<ColumnWriter>(
- new ByteColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new ByteColumnWriter(type, factory, options));
case BOOLEAN:
- return std::unique_ptr<ColumnWriter>(
- new BooleanColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new BooleanColumnWriter(type, factory, options));
case DOUBLE:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- false));
+ return std::unique_ptr<ColumnWriter>(new DoubleColumnWriter(type, factory, options, false));
case FLOAT:
- return std::unique_ptr<ColumnWriter>(
- new DoubleColumnWriter(
- type,
- factory,
- options,
- true));
+ return std::unique_ptr<ColumnWriter>(new DoubleColumnWriter(type, factory, options, true));
case BINARY:
- return std::unique_ptr<ColumnWriter>(
- new BinaryColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new BinaryColumnWriter(type, factory, options));
case STRING:
- return std::unique_ptr<ColumnWriter>(
- new StringColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new StringColumnWriter(type, factory, options));
case CHAR:
- return std::unique_ptr<ColumnWriter>(
- new CharColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new CharColumnWriter(type, factory, options));
case VARCHAR:
- return std::unique_ptr<ColumnWriter>(
- new VarCharColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new VarCharColumnWriter(type, factory, options));
case DATE:
- return std::unique_ptr<ColumnWriter>(
- new DateColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new DateColumnWriter(type, factory, options));
case TIMESTAMP:
return std::unique_ptr<ColumnWriter>(
- new TimestampColumnWriter(
- type,
- factory,
- options,
- false));
+ new TimestampColumnWriter(type, factory, options, false));
case TIMESTAMP_INSTANT:
return std::unique_ptr<ColumnWriter>(
- new TimestampColumnWriter(
- type,
- factory,
- options,
- true));
+ new TimestampColumnWriter(type, factory, options, true));
case DECIMAL:
if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) {
if (options.getFileVersion() == FileVersion::UNSTABLE_PRE_2_0()) {
return std::unique_ptr<ColumnWriter>(
- new Decimal64ColumnWriterV2(
- type,
- factory,
- options));
+ new Decimal64ColumnWriterV2(type, factory, options));
}
- return std::unique_ptr<ColumnWriter>(
- new Decimal64ColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new Decimal64ColumnWriter(type, factory, options));
} else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) {
- return std::unique_ptr<ColumnWriter>(
- new Decimal128ColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new Decimal128ColumnWriter(type, factory, options));
} else {
- throw NotImplementedYet("Decimal precision more than 38 is not "
- "supported");
+ throw NotImplementedYet(
+ "Decimal precision more than 38 is not "
+ "supported");
}
case LIST:
- return std::unique_ptr<ColumnWriter>(
- new ListColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new ListColumnWriter(type, factory, options));
case MAP:
- return std::unique_ptr<ColumnWriter>(
- new MapColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new MapColumnWriter(type, factory, options));
case UNION:
- return std::unique_ptr<ColumnWriter>(
- new UnionColumnWriter(
- type,
- factory,
- options));
+ return std::unique_ptr<ColumnWriter>(new UnionColumnWriter(type, factory, options));
default:
- throw NotImplementedYet("Type is not supported yet for creating "
- "ColumnWriter.");
+ throw NotImplementedYet(
+ "Type is not supported yet for creating "
+ "ColumnWriter.");
}
}
-}
+} // namespace orc
diff --git a/c++/src/ColumnWriter.hh b/c++/src/ColumnWriter.hh
index 20983774c..f21ffd6f8 100644
--- a/c++/src/ColumnWriter.hh
+++ b/c++/src/ColumnWriter.hh
@@ -24,15 +24,15 @@
#include "BloomFilter.hh"
#include "ByteRLE.hh"
#include "Compression.hh"
-#include "orc/Exceptions.hh"
#include "Statistics.hh"
+#include "orc/Exceptions.hh"
#include "wrap/orc-proto-wrapper.hh"
namespace orc {
class StreamsFactory {
- public:
+ public:
virtual ~StreamsFactory();
/**
@@ -40,29 +40,26 @@ namespace orc {
* @param kind the kind of the stream
* @return the buffered output stream
*/
- virtual std::unique_ptr<BufferedOutputStream>
- createStream(proto::Stream_Kind kind) const = 0;
+ virtual std::unique_ptr<BufferedOutputStream> createStream(proto::Stream_Kind kind) const = 0;
};
- std::unique_ptr<StreamsFactory> createStreamsFactory(
- const WriterOptions& options,
- OutputStream * outStream);
+ std::unique_ptr<StreamsFactory> createStreamsFactory(const WriterOptions& options,
+ OutputStream* outStream);
/**
* record stream positions for row index
*/
class RowIndexPositionRecorder : public PositionRecorder {
- public:
+ public:
virtual ~RowIndexPositionRecorder() override;
- RowIndexPositionRecorder(proto::RowIndexEntry& entry):
- rowIndexEntry(entry) {}
+ RowIndexPositionRecorder(proto::RowIndexEntry& entry) : rowIndexEntry(entry) {}
virtual void add(uint64_t pos) override {
rowIndexEntry.add_positions(pos);
}
- private:
+ private:
proto::RowIndexEntry& rowIndexEntry;
};
@@ -70,7 +67,7 @@ namespace orc {
* The interface for writing ORC data types.
*/
class ColumnWriter {
- protected:
+ protected:
std::unique_ptr<ByteRleEncoder> notNullEncoder;
uint64_t columnId;
std::unique_ptr<MutableColumnStatistics> colIndexStatistics;
@@ -88,9 +85,8 @@ namespace orc {
std::unique_ptr<BloomFilterImpl> bloomFilter;
std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex;
- public:
- ColumnWriter(const Type& type, const StreamsFactory& factory,
- const WriterOptions& options);
+ public:
+ ColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options);
virtual ~ColumnWriter();
@@ -103,10 +99,8 @@ namespace orc {
* a mask (with at least numValues bytes) for which
* values to write.
*/
- virtual void add(ColumnVectorBatch& rowBatch,
- uint64_t offset,
- uint64_t numValues,
- const char * incomingMask);
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
+ const char* incomingMask);
/**
* Flush column writer output streams.
* @param streams vector to store streams generated by flush()
@@ -123,22 +117,19 @@ namespace orc {
* Get the encoding used by the writer for this column.
* @param encodings vector to store the returned ColumnEncoding info
*/
- virtual void getColumnEncoding(
- std::vector<proto::ColumnEncoding>& encodings) const = 0;
+ virtual void getColumnEncoding(std::vector<proto::ColumnEncoding>& encodings) const = 0;
/**
* Get the stripe statistics for this column.
* @param stats vector to store the returned stripe statistics
*/
- virtual void getStripeStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
+ virtual void getStripeStatistics(std::vector<proto::ColumnStatistics>& stats) const;
/**
* Get the file statistics for this column.
* @param stats vector to store the returned file statistics
*/
- virtual void getFileStatistics(
- std::vector<proto::ColumnStatistics>& stats) const;
+ virtual void getFileStatistics(std::vector<proto::ColumnStatistics>& stats) const;
/**
* Merge index stats into stripe stats and reset index stats.
@@ -167,7 +158,7 @@ namespace orc {
* Write row index streams for this column.
* @param streams output list of ROW_INDEX streams
*/
- virtual void writeIndex(std::vector<proto::Stream> &streams) const;
+ virtual void writeIndex(std::vector<proto::Stream>& streams) const;
/**
* Record positions for index.
@@ -188,22 +179,21 @@ namespace orc {
*/
virtual void writeDictionary();
- protected:
+ protected:
/**
* Utility function to translate ColumnStatistics into protobuf form and
* add it to output list.
* @param statsList output list for protobuf stats
* @param stats ColumnStatistics to be transformed and added
*/
- void getProtoBufStatistics(
- std::vector<proto::ColumnStatistics>& statsList,
- const MutableColumnStatistics* stats) const {
- proto::ColumnStatistics pbStats;
- stats->toProtoBuf(pbStats);
- statsList.push_back(pbStats);
- }
+ void getProtoBufStatistics(std::vector<proto::ColumnStatistics>& statsList,
+ const MutableColumnStatistics* stats) const {
+ proto::ColumnStatistics pbStats;
+ stats->toProtoBuf(pbStats);
+ statsList.push_back(pbStats);
+ }
- protected:
+ protected:
MemoryPool& memPool;
std::unique_ptr<BufferedOutputStream> indexStream;
std::unique_ptr<BufferedOutputStream> bloomFilterStream;
@@ -213,10 +203,8 @@ namespace orc {
/**
* Create a writer for the given type.
*/
- std::unique_ptr<ColumnWriter> buildWriter(
- const Type& type,
- const StreamsFactory& factory,
+ std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options);
-}
+} // namespace orc
#endif
diff --git a/c++/src/Common.cc b/c++/src/Common.cc
index 477bfd3b4..e220e274d 100644
--- a/c++/src/Common.cc
+++ b/c++/src/Common.cc
@@ -138,14 +138,14 @@ namespace orc {
ss << majorVersion << '.' << minorVersion;
return ss.str();
}
-
- const FileVersion& FileVersion::v_0_11(){
- static FileVersion version(0,11);
+
+ const FileVersion& FileVersion::v_0_11() {
+ static FileVersion version(0, 11);
return version;
}
-
- const FileVersion& FileVersion::v_0_12(){
- static FileVersion version(0,12);
+
+ const FileVersion& FileVersion::v_0_12() {
+ static FileVersion version(0, 12);
return version;
}
@@ -156,9 +156,9 @@ namespace orc {
* without providing any forward or backward compatibility.
*
* When 2.0 is released, this version identifier will be completely removed.
- */
+ */
const FileVersion& FileVersion::UNSTABLE_PRE_2_0() {
static FileVersion version(1, 9999);
return version;
}
-}
+} // namespace orc
diff --git a/c++/src/Compression.cc b/c++/src/Compression.cc
index 5e256c5cd..6acc52d6f 100644
--- a/c++/src/Compression.cc
+++ b/c++/src/Compression.cc
@@ -16,12 +16,12 @@
* limitations under the License.
*/
-#include "Adaptor.hh"
#include "Compression.hh"
-#include "Utils.hh"
-#include "orc/Exceptions.hh"
+#include "Adaptor.hh"
#include "LzoDecompressor.hh"
+#include "Utils.hh"
#include "lz4.h"
+#include "orc/Exceptions.hh"
#include <algorithm>
#include <array>
@@ -49,26 +49,24 @@
namespace orc {
- class CompressionStreamBase: public BufferedOutputStream {
- public:
- CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics);
+ class CompressionStreamBase : public BufferedOutputStream {
+ public:
+ CompressionStreamBase(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
- virtual bool Next(void** data, int*size) override = 0;
+ virtual bool Next(void** data, int* size) override = 0;
virtual void BackUp(int count) override;
virtual std::string getName() const override = 0;
virtual uint64_t flush() override;
virtual void suppress() override;
- virtual bool isCompressed() const override { return true; }
+ virtual bool isCompressed() const override {
+ return true;
+ }
virtual uint64_t getSize() const override;
- protected:
+ protected:
void writeData(const unsigned char* data, int size);
void writeHeader(size_t compressedSize, bool original) {
@@ -87,7 +85,7 @@ namespace orc {
int level;
// Compressed data output buffer
- char * outputBuffer;
+ char* outputBuffer;
// Size for compressionBuffer
int bufferSize;
@@ -103,23 +101,16 @@ namespace orc {
std::array<char*, HEADER_SIZE> header;
};
- CompressionStreamBase::CompressionStreamBase(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics) :
- BufferedOutputStream(pool,
- outStream,
- capacity,
- blockSize,
- metrics),
- rawInputBuffer(pool, blockSize),
- level(compressionLevel),
- outputBuffer(nullptr),
- bufferSize(0),
- outputPosition(0),
- outputSize(0) {
+ CompressionStreamBase::CompressionStreamBase(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize,
+ MemoryPool& pool, WriterMetrics* metrics)
+ : BufferedOutputStream(pool, outStream, capacity, blockSize, metrics),
+ rawInputBuffer(pool, blockSize),
+ level(compressionLevel),
+ outputBuffer(nullptr),
+ bufferSize(0),
+ outputPosition(0),
+ outputSize(0) {
// init header pointer array
header.fill(nullptr);
}
@@ -132,7 +123,7 @@ namespace orc {
}
uint64_t CompressionStreamBase::flush() {
- void * data;
+ void* data;
int size;
if (!Next(&data, &size)) {
throw std::runtime_error("Failed to flush compression buffer.");
@@ -149,8 +140,7 @@ namespace orc {
}
uint64_t CompressionStreamBase::getSize() const {
- return BufferedOutputStream::getSize() -
- static_cast<uint64_t>(outputSize - outputPosition);
+ return BufferedOutputStream::getSize() - static_cast<uint64_t>(outputSize - outputPosition);
}
// write the data content into outputBuffer
@@ -158,22 +148,16 @@ namespace orc {
int offset = 0;
while (offset < size) {
if (outputPosition == outputSize) {
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
- } else if (outputPosition > outputSize) {
+ } else if (outputPosition > outputSize) {
// for safety this will unlikely happen
- throw std::logic_error(
- "Write to an out-of-bound place during compression!");
+ throw std::logic_error("Write to an out-of-bound place during compression!");
}
int currentSize = std::min(outputSize - outputPosition, size - offset);
- memcpy(outputBuffer + outputPosition,
- data + offset,
- static_cast<size_t>(currentSize));
+ memcpy(outputBuffer + outputPosition, data + offset, static_cast<size_t>(currentSize));
offset += currentSize;
outputPosition += currentSize;
}
@@ -183,11 +167,8 @@ namespace orc {
// adjust 3 bytes for the compression header
for (uint32_t i = 0; i < HEADER_SIZE; ++i) {
if (outputPosition >= outputSize) {
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
}
@@ -199,39 +180,27 @@ namespace orc {
/**
* Streaming compression base class
*/
- class CompressionStream: public CompressionStreamBase {
- public:
- CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics);
-
- virtual bool Next(void** data, int*size) override;
+ class CompressionStream : public CompressionStreamBase {
+ public:
+ CompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
+
+ virtual bool Next(void** data, int* size) override;
virtual std::string getName() const override = 0;
- protected:
+ protected:
// return total compressed size
virtual uint64_t doStreamingCompression() = 0;
};
- CompressionStream::CompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics) :
- CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics) {
+ CompressionStream::CompressionStream(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize, MemoryPool& pool,
+ WriterMetrics* metrics)
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
// PASS
}
- bool CompressionStream::Next(void** data, int*size) {
+ bool CompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
@@ -259,14 +228,10 @@ namespace orc {
return true;
}
- class ZlibCompressionStream: public CompressionStream {
- public:
- ZlibCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics);
+ class ZlibCompressionStream : public CompressionStream {
+ public:
+ ZlibCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics);
virtual ~ZlibCompressionStream() override {
end();
@@ -274,28 +239,19 @@ namespace orc {
virtual std::string getName() const override;
- protected:
+ protected:
virtual uint64_t doStreamingCompression() override;
- private:
+ private:
void init();
void end();
z_stream strm;
};
- ZlibCompressionStream::ZlibCompressionStream(
- OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics)
- : CompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics) {
+ ZlibCompressionStream::ZlibCompressionStream(OutputStream* outStream, int compressionLevel,
+ uint64_t capacity, uint64_t blockSize,
+ MemoryPool& pool, WriterMetrics* metrics)
+ : CompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
init();
}
@@ -309,18 +265,13 @@ namespace orc {
do {
if (outputPosition >= outputSize) {
- if (!BufferedOutputStream::Next(
- reinterpret_cast<void **>(&outputBuffer),
- &outputSize)) {
- throw std::runtime_error(
- "Failed to get next output buffer from output stream.");
+ if (!BufferedOutputStream::Next(reinterpret_cast<void**>(&outputBuffer), &outputSize)) {
+ throw std::runtime_error("Failed to get next output buffer from output stream.");
}
outputPosition = 0;
}
- strm.next_out = reinterpret_cast<unsigned char *>
- (outputBuffer + outputPosition);
- strm.avail_out = static_cast<unsigned int>
- (outputSize - outputPosition);
+ strm.next_out = reinterpret_cast<unsigned char*>(outputBuffer + outputPosition);
+ strm.avail_out = static_cast<unsigned int>(outputSize - outputPosition);
int ret = deflate(&strm, Z_FINISH);
outputPosition = outputSize - static_cast<int>(strm.avail_out);
@@ -341,7 +292,7 @@ namespace orc {
return "ZlibCompressionStream";
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
@@ -353,8 +304,7 @@ DIAGNOSTIC_PUSH
strm.opaque = nullptr;
strm.next_in = nullptr;
- if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)
- != Z_OK) {
+ if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) {
throw std::runtime_error("Error while calling deflateInit2() for zlib.");
}
}
@@ -363,43 +313,46 @@ DIAGNOSTIC_PUSH
(void)deflateEnd(&strm);
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
- enum DecompressState { DECOMPRESS_HEADER,
- DECOMPRESS_START,
- DECOMPRESS_CONTINUE,
- DECOMPRESS_ORIGINAL,
- DECOMPRESS_EOF};
+ enum DecompressState {
+ DECOMPRESS_HEADER,
+ DECOMPRESS_START,
+ DECOMPRESS_CONTINUE,
+ DECOMPRESS_ORIGINAL,
+ DECOMPRESS_EOF
+ };
std::string decompressStateToString(DecompressState state) {
switch (state) {
- case DECOMPRESS_HEADER: return "DECOMPRESS_HEADER";
- case DECOMPRESS_START: return "DECOMPRESS_START";
- case DECOMPRESS_CONTINUE: return "DECOMPRESS_CONTINUE";
- case DECOMPRESS_ORIGINAL: return "DECOMPRESS_ORIGINAL";
- case DECOMPRESS_EOF: return "DECOMPRESS_EOF";
+ case DECOMPRESS_HEADER:
+ return "DECOMPRESS_HEADER";
+ case DECOMPRESS_START:
+ return "DECOMPRESS_START";
+ case DECOMPRESS_CONTINUE:
+ return "DECOMPRESS_CONTINUE";
+ case DECOMPRESS_ORIGINAL:
+ return "DECOMPRESS_ORIGINAL";
+ case DECOMPRESS_EOF:
+ return "DECOMPRESS_EOF";
}
return "unknown";
}
class DecompressionStream : public SeekableInputStream {
- public:
- DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& pool,
- ReaderMetrics* metrics);
+ public:
+ DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t bufferSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~DecompressionStream() override {}
- virtual bool Next(const void** data, int*size) override;
+ virtual bool Next(const void** data, int* size) override;
virtual void BackUp(int count) override;
virtual bool Skip(int count) override;
virtual int64_t ByteCount() const override;
virtual void seek(PositionProvider& position) override;
virtual std::string getName() const override = 0;
- protected:
- virtual void NextDecompress(const void** data,
- int*size,
- size_t availableSize) = 0;
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) = 0;
std::string getStreamName() const;
void readBuffer(bool failOnEof);
@@ -417,8 +370,8 @@ DIAGNOSTIC_PUSH
// The starting and current position of the buffer for the uncompressed
// data. It either points to the data buffer or the underlying input stream.
- const char *outputBufferStart;
- const char *outputBuffer;
+ const char* outputBufferStart;
+ const char* outputBuffer;
size_t outputBufferLength;
// The uncompressed buffer length. For compressed chunk, it's the original
// (ie. the overall) and the actual length of the decompressed data.
@@ -430,9 +383,9 @@ DIAGNOSTIC_PUSH
size_t remainingLength;
// the last buffer returned from the input
- const char *inputBufferStart;
- const char *inputBuffer;
- const char *inputBufferEnd;
+ const char* inputBufferStart;
+ const char* inputBuffer;
+ const char* inputBufferEnd;
// Variables for saving the position of the header and the start of the
// buffer. Used when we have to seek a position.
@@ -445,28 +398,25 @@ DIAGNOSTIC_PUSH
ReaderMetrics* metrics;
};
- DecompressionStream::DecompressionStream(
- std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ) : pool(_pool),
- input(std::move(inStream)),
- outputDataBuffer(pool, bufferSize),
- state(DECOMPRESS_HEADER),
- outputBufferStart(nullptr),
- outputBuffer(nullptr),
- outputBufferLength(0),
- uncompressedBufferLength(0),
- remainingLength(0),
- inputBufferStart(nullptr),
- inputBuffer(nullptr),
- inputBufferEnd(nullptr),
- headerPosition(0),
- inputBufferStartPosition(0),
- bytesReturned(0),
- metrics(_metrics) {
- }
+ DecompressionStream::DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t bufferSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : pool(_pool),
+ input(std::move(inStream)),
+ outputDataBuffer(pool, bufferSize),
+ state(DECOMPRESS_HEADER),
+ outputBufferStart(nullptr),
+ outputBuffer(nullptr),
+ outputBufferLength(0),
+ uncompressedBufferLength(0),
+ remainingLength(0),
+ inputBufferStart(nullptr),
+ inputBuffer(nullptr),
+ inputBufferEnd(nullptr),
+ headerPosition(0),
+ inputBufferStartPosition(0),
+ bytesReturned(0),
+ metrics(_metrics) {}
std::string DecompressionStream::getStreamName() const {
return input->getName();
@@ -475,8 +425,7 @@ DIAGNOSTIC_PUSH
void DecompressionStream::readBuffer(bool failOnEof) {
SCOPED_MINUS_STOPWATCH(metrics, DecompressionLatencyUs);
int length;
- if (!input->Next(reinterpret_cast<const void**>(&inputBuffer),
- &length)) {
+ if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), &length)) {
if (failOnEof) {
throw ParseError("Read past EOF in DecompressionStream::readBuffer");
}
@@ -486,8 +435,7 @@ DIAGNOSTIC_PUSH
inputBufferStart = nullptr;
} else {
inputBufferEnd = inputBuffer + length;
- inputBufferStartPosition
- = static_cast<size_t>(input->ByteCount() - length);
+ inputBufferStartPosition = static_cast<size_t>(input->ByteCount() - length);
inputBufferStart = inputBuffer;
}
}
@@ -518,7 +466,7 @@ DIAGNOSTIC_PUSH
}
}
- bool DecompressionStream::Next(const void** data, int*size) {
+ bool DecompressionStream::Next(const void** data, int* size) {
SCOPED_STOPWATCH(metrics, DecompressionLatencyUs, DecompressionCall);
// If we are starting a new header, we will have to store its positions
// after decompressing.
@@ -535,8 +483,8 @@ DIAGNOSTIC_PUSH
if (state == DECOMPRESS_HEADER || remainingLength == 0) {
readHeader();
// Here we already read the three bytes of the header.
- headerPosition = inputBufferStartPosition
- + static_cast<size_t>(inputBuffer - inputBufferStart) - 3;
+ headerPosition =
+ inputBufferStartPosition + static_cast<size_t>(inputBuffer - inputBufferStart) - 3;
saveBufferPositions = true;
}
if (state == DECOMPRESS_EOF) {
@@ -546,8 +494,7 @@ DIAGNOSTIC_PUSH
readBuffer(true);
}
size_t availableSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength);
if (state == DECOMPRESS_ORIGINAL) {
*data = inputBuffer;
*size = static_cast<int>(availableSize);
@@ -558,8 +505,9 @@ DIAGNOSTIC_PUSH
} else if (state == DECOMPRESS_START) {
NextDecompress(data, size, availableSize);
} else {
- throw std::logic_error("Unknown compression state in "
- "DecompressionStream::Next");
+ throw std::logic_error(
+ "Unknown compression state in "
+ "DecompressionStream::Next");
}
bytesReturned += static_cast<off_t>(*size);
if (saveBufferPositions) {
@@ -587,7 +535,7 @@ DIAGNOSTIC_PUSH
// this is a stupid implementation for now.
// should skip entire blocks without decompressing
while (count > 0) {
- const void *ptr;
+ const void* ptr;
int len;
if (!Next(&ptr, &len)) {
return false;
@@ -617,10 +565,10 @@ DIAGNOSTIC_PUSH
// Case 1: the seeked position is in the current chunk and it's buffered and
// decompressed/uncompressed. Note that after the headerPosition comes the 3 bytes of
// the header.
- if (headerPosition == seekedHeaderPosition
- && inputBufferStartPosition <= headerPosition + 3 && inputBufferStart) {
- position.next(); // Skip the input level position, i.e. seekedHeaderPosition.
- size_t posInChunk = position.next(); // Chunk level position.
+ if (headerPosition == seekedHeaderPosition && inputBufferStartPosition <= headerPosition + 3 &&
+ inputBufferStart) {
+ position.next(); // Skip the input level position, i.e. seekedHeaderPosition.
+ size_t posInChunk = position.next(); // Chunk level position.
// Case 1.a: The position is in the decompressed/uncompressed buffer. Here we only
// need to set the output buffer's pointer to the seeked position.
if (uncompressedBufferLength >= posInChunk) {
@@ -632,9 +580,8 @@ DIAGNOSTIC_PUSH
// Skip bytes to seek.
if (!Skip(static_cast<int>(posInChunk - uncompressedBufferLength))) {
std::ostringstream ss;
- ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk="
- << posInChunk << ") in " << getName() << ". DecompressionState: "
- << decompressStateToString(state);
+ ss << "Bad seek to (chunkHeader=" << seekedHeaderPosition << ", posInChunk=" << posInChunk
+ << ") in " << getName() << ". DecompressionState: " << decompressStateToString(state);
throw ParseError(ss.str());
}
return;
@@ -649,15 +596,14 @@ DIAGNOSTIC_PUSH
// Case 2: The input is buffered, but not yet decompressed. No need to
// force re-reading the inputBuffer, we just have to move it to the
// seeked position.
- position.next(); // Skip the input level position.
- inputBuffer
- = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition);
+ position.next(); // Skip the input level position.
+ inputBuffer = inputBufferStart + (seekedHeaderPosition - inputBufferStartPosition);
} else {
// Case 3: The seeked position is not in the input buffer, here we are
// forcing to read it.
inputBuffer = nullptr;
inputBufferEnd = nullptr;
- input->seek(position); // Actually use the input level position.
+ input->seek(position); // Actually use the input level position.
}
bytesReturned = static_cast<off_t>(input->ByteCount());
if (!Skip(static_cast<int>(position.next()))) {
@@ -666,35 +612,29 @@ DIAGNOSTIC_PUSH
}
class ZlibDecompressionStream : public DecompressionStream {
- public:
- ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool,
- ReaderMetrics* metrics);
+ public:
+ ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~ZlibDecompressionStream() override;
virtual std::string getName() const override;
- protected:
- virtual void NextDecompress(const void** data,
- int* size,
- size_t availableSize) override;
- private:
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) override;
+
+ private:
z_stream zstream;
};
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
- ZlibDecompressionStream::ZlibDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t bufferSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ): DecompressionStream
- (std::move(inStream), bufferSize, _pool, _metrics) {
+ ZlibDecompressionStream::ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t bufferSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : DecompressionStream(std::move(inStream), bufferSize, _pool, _metrics) {
zstream.next_in = nullptr;
zstream.avail_in = 0;
zstream.zalloc = nullptr;
@@ -704,20 +644,20 @@ DIAGNOSTIC_PUSH
zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
int64_t result = inflateInit2(&zstream, -15);
switch (result) {
- case Z_OK:
- break;
- case Z_MEM_ERROR:
- throw std::logic_error("Memory error from inflateInit2");
- case Z_VERSION_ERROR:
- throw std::logic_error("Version error from inflateInit2");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error from inflateInit2");
- default:
- throw std::logic_error("Unknown error from inflateInit2");
+ case Z_OK:
+ break;
+ case Z_MEM_ERROR:
+ throw std::logic_error("Memory error from inflateInit2");
+ case Z_VERSION_ERROR:
+ throw std::logic_error("Version error from inflateInit2");
+ case Z_STREAM_ERROR:
+ throw std::logic_error("Stream error from inflateInit2");
+ default:
+ throw std::logic_error("Unknown error from inflateInit2");
}
}
-DIAGNOSTIC_POP
+ DIAGNOSTIC_POP
ZlibDecompressionStream::~ZlibDecompressionStream() {
int64_t result = inflateEnd(&zstream);
@@ -727,49 +667,48 @@ DIAGNOSTIC_POP
}
}
- void ZlibDecompressionStream::NextDecompress(const void** data, int* size,
- size_t availableSize) {
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ void ZlibDecompressionStream::NextDecompress(const void** data, int* size, size_t availableSize) {
+ zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
zstream.avail_in = static_cast<uInt>(availableSize);
outputBuffer = outputDataBuffer.data();
- zstream.next_out =
- reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
+ zstream.next_out = reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
zstream.avail_out = static_cast<uInt>(outputDataBuffer.capacity());
if (inflateReset(&zstream) != Z_OK) {
- throw std::logic_error("Bad inflateReset in "
- "ZlibDecompressionStream::NextDecompress");
+ throw std::logic_error(
+ "Bad inflateReset in "
+ "ZlibDecompressionStream::NextDecompress");
}
int64_t result;
do {
- result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH :
- Z_SYNC_FLUSH);
+ result = inflate(&zstream, availableSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH);
switch (result) {
- case Z_OK:
- remainingLength -= availableSize;
- inputBuffer += availableSize;
- readBuffer(true);
- availableSize =
- std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
- remainingLength);
- zstream.next_in =
- reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
- zstream.avail_in = static_cast<uInt>(availableSize);
- break;
- case Z_STREAM_END:
- break;
- case Z_BUF_ERROR:
- throw std::logic_error("Buffer error in "
- "ZlibDecompressionStream::NextDecompress");
- case Z_DATA_ERROR:
- throw std::logic_error("Data error in "
- "ZlibDecompressionStream::NextDecompress");
- case Z_STREAM_ERROR:
- throw std::logic_error("Stream error in "
- "ZlibDecompressionStream::NextDecompress");
- default:
- throw std::logic_error("Unknown error in "
- "ZlibDecompressionStream::NextDecompress");
+ case Z_OK:
+ remainingLength -= availableSize;
+ inputBuffer += availableSize;
+ readBuffer(true);
+ availableSize =
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength);
+ zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+ zstream.avail_in = static_cast<uInt>(availableSize);
+ break;
+ case Z_STREAM_END:
+ break;
+ case Z_BUF_ERROR:
+ throw std::logic_error(
+ "Buffer error in "
+ "ZlibDecompressionStream::NextDecompress");
+ case Z_DATA_ERROR:
+ throw std::logic_error(
+ "Data error in "
+ "ZlibDecompressionStream::NextDecompress");
+ case Z_STREAM_ERROR:
+ throw std::logic_error(
+ "Stream error in "
+ "ZlibDecompressionStream::NextDecompress");
+ default:
+ throw std::logic_error(
+ "Unknown error in "
+ "ZlibDecompressionStream::NextDecompress");
}
} while (result != Z_STREAM_END);
*size = static_cast<int>(outputDataBuffer.capacity() - zstream.avail_out);
@@ -786,46 +725,38 @@ DIAGNOSTIC_POP
return result.str();
}
- class BlockDecompressionStream: public DecompressionStream {
- public:
- BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& pool,
- ReaderMetrics* metrics);
+ class BlockDecompressionStream : public DecompressionStream {
+ public:
+ BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
virtual ~BlockDecompressionStream() override {}
virtual std::string getName() const override = 0;
- protected:
- virtual void NextDecompress(const void** data,
- int* size,
- size_t availableSize) override;
+ protected:
+ virtual void NextDecompress(const void** data, int* size, size_t availableSize) override;
+
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) = 0;
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength) = 0;
- private:
+ private:
// may need to stitch together multiple input buffers;
// to give snappy a contiguous block
DataBuffer<char> inputDataBuffer;
};
- BlockDecompressionStream::BlockDecompressionStream
- (std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ) : DecompressionStream
- (std::move(inStream), blockSize, _pool, _metrics),
- inputDataBuffer(pool, blockSize) {
- }
-
+ BlockDecompressionStream::BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize, MemoryPool& _pool,
+ ReaderMetrics* _metrics)
+ : DecompressionStream(std::move(inStream), blockSize, _pool, _metrics),
+ inputDataBuffer(pool, blockSize) {}
void BlockDecompressionStream::NextDecompress(const void** data, int* size,
- size_t availableSize) {
+ size_t availableSize) {
// Get contiguous bytes of compressed block.
- const char *compressed = inputBuffer;
+ const char* compressed = inputBuffer;
if (remainingLength == availableSize) {
- inputBuffer += availableSize;
+ inputBuffer += availableSize;
} else {
// Did not read enough from input.
if (inputDataBuffer.capacity() < remainingLength) {
@@ -835,19 +766,16 @@ DIAGNOSTIC_POP
inputBuffer += availableSize;
compressed = inputDataBuffer.data();
- for (size_t pos = availableSize; pos < remainingLength; ) {
+ for (size_t pos = availableSize; pos < remainingLength;) {
readBuffer(true);
size_t avail =
- std::min(static_cast<size_t>(inputBufferEnd -
- inputBuffer),
- remainingLength - pos);
+ std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), remainingLength - pos);
::memcpy(inputDataBuffer.data() + pos, inputBuffer, avail);
pos += avail;
inputBuffer += avail;
}
}
- outputBufferLength = decompress(compressed, remainingLength,
- outputDataBuffer.data(),
+ outputBufferLength = decompress(compressed, remainingLength, outputDataBuffer.data(),
outputDataBuffer.capacity());
remainingLength = 0;
state = DECOMPRESS_HEADER;
@@ -857,17 +785,11 @@ DIAGNOSTIC_POP
outputBufferLength = 0;
}
- class SnappyDecompressionStream: public BlockDecompressionStream {
- public:
- SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool,
- _metrics) {
+ class SnappyDecompressionStream : public BlockDecompressionStream {
+ public:
+ SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -877,15 +799,12 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t SnappyDecompressionStream::decompress(const char *_input,
- uint64_t length,
- char *output,
+ uint64_t SnappyDecompressionStream::decompress(const char* _input, uint64_t length, char* output,
size_t maxOutputLength) {
size_t outLength;
if (!snappy::GetUncompressedLength(_input, length, &outLength)) {
@@ -902,17 +821,11 @@ DIAGNOSTIC_POP
return outLength;
}
- class LzoDecompressionStream: public BlockDecompressionStream {
- public:
- LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool,
- _metrics) {
+ class LzoDecompressionStream : public BlockDecompressionStream {
+ public:
+ LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -922,31 +835,21 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t LzoDecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t LzoDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
- return lzoDecompress(inputPtr, inputPtr + length, output,
- output + maxOutputLength);
+ return lzoDecompress(inputPtr, inputPtr + length, output, output + maxOutputLength);
}
- class Lz4DecompressionStream: public BlockDecompressionStream {
- public:
- Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics
- ): BlockDecompressionStream
- (std::move(inStream),
- blockSize,
- _pool,
- _metrics) {
+ class Lz4DecompressionStream : public BlockDecompressionStream {
+ public:
+ Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
// PASS
}
@@ -956,15 +859,12 @@ DIAGNOSTIC_POP
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input, uint64_t length,
- char *output, size_t maxOutputLength
- ) override;
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+ size_t maxOutputLength) override;
};
- uint64_t Lz4DecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t Lz4DecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
int result = LZ4_decompress_safe(inputPtr, output, static_cast<int>(length),
static_cast<int>(maxOutputLength));
@@ -977,29 +877,20 @@ DIAGNOSTIC_POP
/**
* Block compression base class
*/
- class BlockCompressionStream: public CompressionStreamBase {
- public:
- BlockCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics)
- : CompressionStreamBase(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics)
- , compressorBuffer(pool) {
+ class BlockCompressionStream : public CompressionStreamBase {
+ public:
+ BlockCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : CompressionStreamBase(outStream, compressionLevel, capacity, blockSize, pool, metrics),
+ compressorBuffer(pool) {
// PASS
}
- virtual bool Next(void** data, int*size) override;
+ virtual bool Next(void** data, int* size) override;
virtual void suppress() override;
virtual std::string getName() const override = 0;
- protected:
+ protected:
// compresses a block and returns the compressed size
virtual uint64_t doBlockCompression() = 0;
@@ -1011,14 +902,14 @@ DIAGNOSTIC_POP
DataBuffer<unsigned char> compressorBuffer;
};
- bool BlockCompressionStream::Next(void** data, int*size) {
+ bool BlockCompressionStream::Next(void** data, int* size) {
if (bufferSize != 0) {
ensureHeader();
// perform compression
size_t totalCompressedSize = doBlockCompression();
- const unsigned char * dataToWrite = nullptr;
+ const unsigned char* dataToWrite = nullptr;
int totalSizeToWrite = 0;
if (totalCompressedSize >= static_cast<size_t>(bufferSize)) {
@@ -1050,51 +941,40 @@ DIAGNOSTIC_POP
/**
* LZ4 block compression
*/
- class Lz4CompressionSteam: public BlockCompressionStream {
- public:
- Lz4CompressionSteam(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics) {
+ class Lz4CompressionSteam : public BlockCompressionStream {
+ public:
+ Lz4CompressionSteam(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
this->init();
}
virtual std::string getName() const override {
return "Lz4CompressionStream";
}
-
+
virtual ~Lz4CompressionSteam() override {
this->end();
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
return static_cast<uint64_t>(LZ4_compressBound(bufferSize));
}
- private:
+ private:
void init();
void end();
- LZ4_stream_t *state;
+ LZ4_stream_t* state;
};
uint64_t Lz4CompressionSteam::doBlockCompression() {
- int result = LZ4_compress_fast_extState(static_cast<void*>(state),
- reinterpret_cast<const char*>(rawInputBuffer.data()),
- reinterpret_cast<char*>(compressorBuffer.data()),
- bufferSize,
- static_cast<int>(compressorBuffer.size()),
- level);
+ int result = LZ4_compress_fast_extState(
+ static_cast<void*>(state), reinterpret_cast<const char*>(rawInputBuffer.data()),
+ reinterpret_cast<char*>(compressorBuffer.data()), bufferSize,
+ static_cast<int>(compressorBuffer.size()), level);
if (result == 0) {
throw std::runtime_error("Error during block compression using lz4.");
}
@@ -1116,36 +996,25 @@ DIAGNOSTIC_POP
/**
* Snappy block compression
*/
- class SnappyCompressionStream: public BlockCompressionStream {
- public:
- SnappyCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics) {
- }
+ class SnappyCompressionStream : public BlockCompressionStream {
+ public:
+ SnappyCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {}
virtual std::string getName() const override {
return "SnappyCompressionStream";
}
-
+
virtual ~SnappyCompressionStream() override {
// PASS
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
- return static_cast<uint64_t>
- (snappy::MaxCompressedLength(static_cast<size_t>(bufferSize)));
+ return static_cast<uint64_t>(snappy::MaxCompressedLength(static_cast<size_t>(bufferSize)));
}
};
@@ -1153,96 +1022,75 @@ DIAGNOSTIC_POP
size_t compressedLength;
snappy::RawCompress(reinterpret_cast<const char*>(rawInputBuffer.data()),
static_cast<size_t>(bufferSize),
- reinterpret_cast<char*>(compressorBuffer.data()),
- &compressedLength);
+ reinterpret_cast<char*>(compressorBuffer.data()), &compressedLength);
return static_cast<uint64_t>(compressedLength);
}
/**
* ZSTD block compression
*/
- class ZSTDCompressionStream: public BlockCompressionStream {
- public:
- ZSTDCompressionStream(OutputStream * outStream,
- int compressionLevel,
- uint64_t capacity,
- uint64_t blockSize,
- MemoryPool& pool,
- WriterMetrics* metrics)
- : BlockCompressionStream(outStream,
- compressionLevel,
- capacity,
- blockSize,
- pool,
- metrics) {
+ class ZSTDCompressionStream : public BlockCompressionStream {
+ public:
+ ZSTDCompressionStream(OutputStream* outStream, int compressionLevel, uint64_t capacity,
+ uint64_t blockSize, MemoryPool& pool, WriterMetrics* metrics)
+ : BlockCompressionStream(outStream, compressionLevel, capacity, blockSize, pool, metrics) {
this->init();
}
virtual std::string getName() const override {
return "ZstdCompressionStream";
}
-
+
virtual ~ZSTDCompressionStream() override {
this->end();
}
- protected:
+ protected:
virtual uint64_t doBlockCompression() override;
virtual uint64_t estimateMaxCompressionSize() override {
return ZSTD_compressBound(static_cast<size_t>(bufferSize));
}
-
- private:
+
+ private:
void init();
void end();
- ZSTD_CCtx *cctx;
+ ZSTD_CCtx* cctx;
};
uint64_t ZSTDCompressionStream::doBlockCompression() {
- return ZSTD_compressCCtx(cctx,
- compressorBuffer.data(),
- compressorBuffer.size(),
- rawInputBuffer.data(),
- static_cast<size_t>(bufferSize),
- level);
+ return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(),
+ rawInputBuffer.data(), static_cast<size_t>(bufferSize), level);
}
-
-DIAGNOSTIC_PUSH
+
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
void ZSTDCompressionStream::init() {
-
cctx = ZSTD_createCCtx();
if (!cctx) {
throw std::runtime_error("Error while calling ZSTD_createCCtx() for zstd.");
}
}
-
void ZSTDCompressionStream::end() {
(void)ZSTD_freeCCtx(cctx);
cctx = nullptr;
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
/**
* ZSTD block decompression
*/
- class ZSTDDecompressionStream: public BlockDecompressionStream {
- public:
- ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
- size_t blockSize,
- MemoryPool& _pool,
- ReaderMetrics* _metrics)
- : BlockDecompressionStream(std::move(inStream),
- blockSize,
- _pool,
- _metrics) {
+ class ZSTDDecompressionStream : public BlockDecompressionStream {
+ public:
+ ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, size_t blockSize,
+ MemoryPool& _pool, ReaderMetrics* _metrics)
+ : BlockDecompressionStream(std::move(inStream), blockSize, _pool, _metrics) {
this->init();
}
@@ -1256,133 +1104,108 @@ DIAGNOSTIC_PUSH
return result.str();
}
- protected:
- virtual uint64_t decompress(const char *input,
- uint64_t length,
- char *output,
+ protected:
+ virtual uint64_t decompress(const char* input, uint64_t length, char* output,
size_t maxOutputLength) override;
- private:
+ private:
void init();
void end();
- ZSTD_DCtx *dctx;
+ ZSTD_DCtx* dctx;
};
- uint64_t ZSTDDecompressionStream::decompress(const char *inputPtr,
- uint64_t length,
- char *output,
+ uint64_t ZSTDDecompressionStream::decompress(const char* inputPtr, uint64_t length, char* output,
size_t maxOutputLength) {
- return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx,
- output,
- maxOutputLength,
- inputPtr,
- length));
+ return static_cast<uint64_t>(
+ ZSTD_decompressDCtx(dctx, output, maxOutputLength, inputPtr, length));
}
-DIAGNOSTIC_PUSH
+ DIAGNOSTIC_PUSH
#if defined(__GNUC__) || defined(__clang__)
DIAGNOSTIC_IGNORE("-Wold-style-cast")
#endif
void ZSTDDecompressionStream::init() {
-
dctx = ZSTD_createDCtx();
if (!dctx) {
throw std::runtime_error("Error while calling ZSTD_createDCtx() for zstd.");
}
}
-
void ZSTDDecompressionStream::end() {
(void)ZSTD_freeDCtx(dctx);
dctx = nullptr;
}
-DIAGNOSTIC_PUSH
-
- std::unique_ptr<BufferedOutputStream>
- createCompressor(
- CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool,
- WriterMetrics* metrics) {
+ DIAGNOSTIC_PUSH
+
+ std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
+ OutputStream* outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics) {
switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE: {
- return std::unique_ptr<BufferedOutputStream>
- (new BufferedOutputStream(
- pool, outStream, bufferCapacity, compressionBlockSize, metrics));
- }
- case CompressionKind_ZLIB: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
- return std::unique_ptr<BufferedOutputStream>
- (new ZlibCompressionStream(
- outStream, level, bufferCapacity,
- compressionBlockSize, pool, metrics));
- }
- case CompressionKind_ZSTD: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- 1 : ZSTD_CLEVEL_DEFAULT;
- return std::unique_ptr<BufferedOutputStream>
- (new ZSTDCompressionStream(
- outStream, level, bufferCapacity,
- compressionBlockSize, pool, metrics));
- }
- case CompressionKind_LZ4: {
- int level = (strategy == CompressionStrategy_SPEED) ?
- LZ4_ACCELERATION_MAX : LZ4_ACCELERATION_DEFAULT;
- return std::unique_ptr<BufferedOutputStream>
- (new Lz4CompressionSteam(
- outStream, level, bufferCapacity,
- compressionBlockSize, pool, metrics));
- }
- case CompressionKind_SNAPPY: {
- int level = 0;
- return std::unique_ptr<BufferedOutputStream>
- (new SnappyCompressionStream(
- outStream, level, bufferCapacity,
- compressionBlockSize, pool, metrics));
- }
- case CompressionKind_LZO:
- default:
- throw NotImplementedYet("compression codec");
+ case CompressionKind_NONE: {
+ return std::unique_ptr<BufferedOutputStream>(new BufferedOutputStream(
+ pool, outStream, bufferCapacity, compressionBlockSize, metrics));
+ }
+ case CompressionKind_ZLIB: {
+ int level =
+ (strategy == CompressionStrategy_SPEED) ? Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION;
+ return std::unique_ptr<BufferedOutputStream>(new ZlibCompressionStream(
+ outStream, level, bufferCapacity, compressionBlockSize, pool, metrics));
+ }
+ case CompressionKind_ZSTD: {
+ int level = (strategy == CompressionStrategy_SPEED) ? 1 : ZSTD_CLEVEL_DEFAULT;
+ return std::unique_ptr<BufferedOutputStream>(new ZSTDCompressionStream(
+ outStream, level, bufferCapacity, compressionBlockSize, pool, metrics));
+ }
+ case CompressionKind_LZ4: {
+ int level = (strategy == CompressionStrategy_SPEED) ? LZ4_ACCELERATION_MAX
+ : LZ4_ACCELERATION_DEFAULT;
+ return std::unique_ptr<BufferedOutputStream>(new Lz4CompressionSteam(
+ outStream, level, bufferCapacity, compressionBlockSize, pool, metrics));
+ }
+ case CompressionKind_SNAPPY: {
+ int level = 0;
+ return std::unique_ptr<BufferedOutputStream>(new SnappyCompressionStream(
+ outStream, level, bufferCapacity, compressionBlockSize, pool, metrics));
+ }
+ case CompressionKind_LZO:
+ default:
+ throw NotImplementedYet("compression codec");
}
}
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t blockSize,
- MemoryPool& pool,
- ReaderMetrics* metrics) {
+ std::unique_ptr<SeekableInputStream> createDecompressor(
+ CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t blockSize,
+ MemoryPool& pool, ReaderMetrics* metrics) {
switch (static_cast<int64_t>(kind)) {
- case CompressionKind_NONE:
- return REDUNDANT_MOVE(input);
- case CompressionKind_ZLIB:
- return std::unique_ptr<SeekableInputStream>
- (new ZlibDecompressionStream(std::move(input), blockSize, pool, metrics));
- case CompressionKind_SNAPPY:
- return std::unique_ptr<SeekableInputStream>
- (new SnappyDecompressionStream(std::move(input), blockSize, pool, metrics));
- case CompressionKind_LZO:
- return std::unique_ptr<SeekableInputStream>
- (new LzoDecompressionStream(std::move(input), blockSize, pool, metrics));
- case CompressionKind_LZ4:
- return std::unique_ptr<SeekableInputStream>
- (new Lz4DecompressionStream(std::move(input), blockSize, pool, metrics));
- case CompressionKind_ZSTD:
- return std::unique_ptr<SeekableInputStream>
- (new ZSTDDecompressionStream(std::move(input), blockSize, pool, metrics));
- default: {
- std::ostringstream buffer;
- buffer << "Unknown compression codec " << kind;
- throw NotImplementedYet(buffer.str());
- }
+ case CompressionKind_NONE:
+ return REDUNDANT_MOVE(input);
+ case CompressionKind_ZLIB:
+ return std::unique_ptr<SeekableInputStream>(
+ new ZlibDecompressionStream(std::move(input), blockSize, pool, metrics));
+ case CompressionKind_SNAPPY:
+ return std::unique_ptr<SeekableInputStream>(
+ new SnappyDecompressionStream(std::move(input), blockSize, pool, metrics));
+ case CompressionKind_LZO:
+ return std::unique_ptr<SeekableInputStream>(
+ new LzoDecompressionStream(std::move(input), blockSize, pool, metrics));
+ case CompressionKind_LZ4:
+ return std::unique_ptr<SeekableInputStream>(
+ new Lz4DecompressionStream(std::move(input), blockSize, pool, metrics));
+ case CompressionKind_ZSTD:
+ return std::unique_ptr<SeekableInputStream>(
+ new ZSTDDecompressionStream(std::move(input), blockSize, pool, metrics));
+ default: {
+ std::ostringstream buffer;
+ buffer << "Unknown compression codec " << kind;
+ throw NotImplementedYet(buffer.str());
+ }
}
}
-}
+} // namespace orc
diff --git a/c++/src/Compression.hh b/c++/src/Compression.hh
index 50a252443..55b152dd6 100644
--- a/c++/src/Compression.hh
+++ b/c++/src/Compression.hh
@@ -32,12 +32,9 @@ namespace orc {
* @param pool the memory pool
* @param metrics the reader metrics
*/
- std::unique_ptr<SeekableInputStream>
- createDecompressor(CompressionKind kind,
- std::unique_ptr<SeekableInputStream> input,
- uint64_t bufferSize,
- MemoryPool& pool,
- ReaderMetrics* metrics);
+ std::unique_ptr<SeekableInputStream> createDecompressor(
+ CompressionKind kind, std::unique_ptr<SeekableInputStream> input, uint64_t bufferSize,
+ MemoryPool& pool, ReaderMetrics* metrics);
/**
* Create a compressor for the given compression kind.
@@ -48,14 +45,12 @@ namespace orc {
* @param compressionBlockSize compression buffer block size
* @param pool the memory pool
*/
- std::unique_ptr<BufferedOutputStream>
- createCompressor(CompressionKind kind,
- OutputStream * outStream,
- CompressionStrategy strategy,
- uint64_t bufferCapacity,
- uint64_t compressionBlockSize,
- MemoryPool& pool,
- WriterMetrics* metrics);
-}
+ std::unique_ptr<BufferedOutputStream> createCompressor(CompressionKind kind,
+ OutputStream* outStream,
+ CompressionStrategy strategy,
+ uint64_t bufferCapacity,
+ uint64_t compressionBlockSize,
+ MemoryPool& pool, WriterMetrics* metrics);
+} // namespace orc
#endif
diff --git a/c++/src/Exceptions.cc b/c++/src/Exceptions.cc
index 2077b27df..3a711f30a 100644
--- a/c++/src/Exceptions.cc
+++ b/c++/src/Exceptions.cc
@@ -20,18 +20,15 @@
namespace orc {
- NotImplementedYet::NotImplementedYet(const std::string& what_arg
- ) : logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const std::string& what_arg) : logic_error(what_arg) {
// PASS
}
- NotImplementedYet::NotImplementedYet(const char* what_arg
- ) :logic_error(what_arg) {
+ NotImplementedYet::NotImplementedYet(const char* what_arg) : logic_error(what_arg) {
// PASS
}
- NotImplementedYet::NotImplementedYet(const NotImplementedYet& error
- ): logic_error(error) {
+ NotImplementedYet::NotImplementedYet(const NotImplementedYet& error) : logic_error(error) {
// PASS
}
@@ -39,17 +36,15 @@ namespace orc {
// PASS
}
- ParseError::ParseError(const std::string& what_arg
- ): runtime_error(what_arg) {
+ ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) {
// PASS
}
- ParseError::ParseError(const char* what_arg
- ): runtime_error(what_arg) {
+ ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) {
// PASS
}
- ParseError::ParseError(const ParseError& error): runtime_error(error) {
+ ParseError::ParseError(const ParseError& error) : runtime_error(error) {
// PASS
}
@@ -57,22 +52,19 @@ namespace orc {
// PASS
}
- InvalidArgument::InvalidArgument(const std::string& what_arg
- ): runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const std::string& what_arg) : runtime_error(what_arg) {
// PASS
}
- InvalidArgument::InvalidArgument(const char* what_arg
- ): runtime_error(what_arg) {
+ InvalidArgument::InvalidArgument(const char* what_arg) : runtime_error(what_arg) {
// PASS
}
- InvalidArgument::InvalidArgument(const InvalidArgument& error
- ): runtime_error(error) {
+ InvalidArgument::InvalidArgument(const InvalidArgument& error) : runtime_error(error) {
// PASS
}
InvalidArgument::~InvalidArgument() ORC_NOEXCEPT {
// PASS
}
-}
+} // namespace orc
diff --git a/c++/src/Int128.cc b/c++/src/Int128.cc
index 4ff500fba..0a36fe669 100644
--- a/c++/src/Int128.cc
+++ b/c++/src/Int128.cc
@@ -45,7 +45,7 @@ namespace orc {
size_t group = std::min(static_cast<size_t>(18), length - posn);
int64_t chunk = std::stoll(str.substr(posn, group));
int64_t multiple = 1;
- for(size_t i=0; i < group; ++i) {
+ for (size_t i = 0; i < group; ++i) {
multiple *= 10;
}
*this *= multiple;
@@ -58,7 +58,7 @@ namespace orc {
}
}
- Int128& Int128::operator*=(const Int128 &right) {
+ Int128& Int128::operator*=(const Int128& right) {
const uint64_t INT_MASK = 0xffffffff;
const uint64_t CARRY_BIT = INT_MASK + 1;
@@ -100,7 +100,7 @@ namespace orc {
* @param wasNegative a flag for whether the value was original negative
* @result the output length of the array
*/
- int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const {
+ int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const {
uint64_t high;
uint64_t low;
if (highbits < 0) {
@@ -140,7 +140,6 @@ namespace orc {
}
}
-
/**
* Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is
* the MSB. We can replace this with bsrq asm instruction on x64.
@@ -162,10 +161,10 @@ namespace orc {
*/
void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
if (length > 0 && bits != 0) {
- for(int64_t i=0; i < length-1; ++i) {
- array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits));
+ for (int64_t i = 0; i < length - 1; ++i) {
+ array[i] = (array[i] << bits) | (array[i + 1] >> (32 - bits));
}
- array[length-1] <<= bits;
... 48461 lines suppressed ...