You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/06/27 21:07:37 UTC
[3/3] parquet-cpp git commit: PARQUET-1045: Remove code that's being
moved to Apache Arrow in ARROW-1154
PARQUET-1045: Remove code that's being moved to Apache Arrow in ARROW-1154
I verified locally that this works on Linux with the current iteration of https://github.com/apache/arrow/pull/785. I will update the commit in ThirdpartyToolchain.cmake after that is merged to get a passing build.
Author: Wes McKinney <we...@twosigma.com>
Closes #364 from wesm/PARQUET-1045 and squashes the following commits:
fede8d9 [Wes McKinney] Update Arrow version to include ARROW-1159
b44b4ab [Wes McKinney] Fix benchmark. Only build Zlib on Windows when building Thrift
83d2289 [Wes McKinney] Update Arrow version
8914c16 [Wes McKinney] Remove code that was moved to Apache Arrow, refactoring
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/2f5ef895
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/2f5ef895
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/2f5ef895
Branch: refs/heads/master
Commit: 2f5ef8957851fe13dfb1b8c67f7a6786730a404e
Parents: 84db929
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jun 27 17:07:30 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jun 27 17:07:30 2017 -0400
----------------------------------------------------------------------
CMakeLists.txt | 2 -
benchmarks/decode_benchmark.cc | 6 +-
cmake_modules/ThirdpartyToolchain.cmake | 53 +-
src/parquet/arrow/reader.cc | 6 +-
src/parquet/arrow/writer.cc | 9 +-
src/parquet/column_reader.cc | 7 +-
src/parquet/column_reader.h | 10 +-
src/parquet/column_writer.cc | 7 +-
src/parquet/column_writer.h | 10 +-
src/parquet/encoding-internal.h | 45 +-
src/parquet/encoding-test.cc | 5 +-
src/parquet/encoding.h | 3 +-
src/parquet/types.h | 3 +-
src/parquet/util/CMakeLists.txt | 10 -
src/parquet/util/bit-stream-utils.h | 170 --
src/parquet/util/bit-stream-utils.inline.h | 258 --
src/parquet/util/bit-util-test.cc | 190 --
src/parquet/util/bit-util.h | 340 ---
src/parquet/util/bpacking.h | 3342 -----------------------
src/parquet/util/compiler-util.h | 63 -
src/parquet/util/cpu-info.cc | 208 --
src/parquet/util/cpu-info.h | 92 -
src/parquet/util/hash-util.h | 258 --
src/parquet/util/logging.h | 105 +-
src/parquet/util/memory.cc | 4 +-
src/parquet/util/memory.h | 2 +
src/parquet/util/rle-encoding.h | 599 ----
src/parquet/util/rle-test.cc | 460 ----
src/parquet/util/sse-util.h | 237 --
29 files changed, 93 insertions(+), 6411 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47984e6..0627b00 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -618,8 +618,6 @@ set(LIBPARQUET_SRCS
src/parquet/parquet_constants.cpp
src/parquet/parquet_types.cpp
-
- src/parquet/util/cpu-info.cc
src/parquet/util/memory.cc
)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/benchmarks/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
index 44776a8..5514d8b 100644
--- a/benchmarks/decode_benchmark.cc
+++ b/benchmarks/decode_benchmark.cc
@@ -40,10 +40,10 @@ class DeltaBitPackEncoder {
uint8_t* Encode(int* encoded_len) {
uint8_t* result = new uint8_t[10 * 1024 * 1024];
- int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_);
+ int num_mini_blocks = arrow::BitUtil::Ceil(num_values() - 1, mini_block_size_);
uint8_t* mini_block_widths = NULL;
- parquet::BitWriter writer(result, 10 * 1024 * 1024);
+ arrow::BitWriter writer(result, 10 * 1024 * 1024);
// Writer the size of each block. We only use 1 block currently.
writer.PutVlqInt(num_mini_blocks * mini_block_size_);
@@ -83,7 +83,7 @@ class DeltaBitPackEncoder {
// The bit width for this block is the number of bits needed to store
// (max_delta - min_delta).
- int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta);
+ int bit_width = arrow::BitUtil::NumRequiredBits(max_delta - min_delta);
mini_block_widths[i] = bit_width;
// Encode this mini blocking using min_delta and bit_width
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 5c4e565..716debc 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -109,34 +109,6 @@ set(LIBS ${LIBS} ${Boost_LIBRARIES})
# ----------------------------------------------------------------------
# ZLIB
-set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install")
-set(ZLIB_HOME "${ZLIB_PREFIX}")
-set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include")
-if (MSVC)
- if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG")
- set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib)
- else()
- set(ZLIB_STATIC_LIB_NAME zlibstatic.lib)
- endif()
-else()
- set(ZLIB_STATIC_LIB_NAME libz.a)
-endif()
-set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}")
-set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
- -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}
- -DCMAKE_C_FLAGS=${EP_C_FLAGS}
- -DBUILD_SHARED_LIBS=OFF)
-ExternalProject_Add(zlib_ep
- URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz"
- BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}"
- ${ZLIB_BUILD_BYPRODUCTS}
- CMAKE_ARGS ${ZLIB_CMAKE_ARGS})
-
-include_directories(SYSTEM ${ZLIB_INCLUDE_DIR})
-add_library(zlibstatic STATIC IMPORTED)
-set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB})
-add_dependencies(zlibstatic zlib_ep)
-
# ----------------------------------------------------------------------
# Thrift
@@ -144,6 +116,29 @@ add_dependencies(zlibstatic zlib_ep)
find_package(Thrift)
if (NOT THRIFT_FOUND)
+ set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install")
+ set(ZLIB_HOME "${ZLIB_PREFIX}")
+ set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include")
+ if (MSVC)
+ if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG")
+ set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib)
+ else()
+ set(ZLIB_STATIC_LIB_NAME zlibstatic.lib)
+ endif()
+ else()
+ set(ZLIB_STATIC_LIB_NAME libz.a)
+ endif()
+ set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}")
+ set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+ -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX}
+ -DCMAKE_C_FLAGS=${EP_C_FLAGS}
+ -DBUILD_SHARED_LIBS=OFF)
+ ExternalProject_Add(zlib_ep
+ URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz"
+ BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}"
+ ${ZLIB_BUILD_BYPRODUCTS}
+ CMAKE_ARGS ${ZLIB_CMAKE_ARGS})
+
set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep/src/thrift_ep-install")
set(THRIFT_HOME "${THRIFT_PREFIX}")
set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include")
@@ -341,7 +336,7 @@ if (NOT ARROW_FOUND)
-DARROW_BUILD_TESTS=OFF)
if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
- set(ARROW_VERSION "98f7cac6e162d9775d615d07b9867c1ec0030f82")
+ set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673")
else()
set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
endif()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index a3a26c9..dd1c9d2 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -26,12 +26,12 @@
#include <thread>
#include <vector>
+#include "arrow/api.h"
+#include "arrow/util/bit-util.h"
+
#include "parquet/arrow/schema.h"
-#include "parquet/util/bit-util.h"
#include "parquet/util/schema-util.h"
-#include "arrow/api.h"
-
using arrow::Array;
using arrow::BooleanArray;
using arrow::Column;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index b8cb45c..1e3f6de 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -21,14 +21,13 @@
#include <string>
#include <vector>
-#include "parquet/util/bit-util.h"
-#include "parquet/util/logging.h"
-
-#include "parquet/arrow/schema.h"
-
#include "arrow/api.h"
+#include "arrow/util/bit-util.h"
#include "arrow/visitor_inline.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/util/logging.h"
+
using arrow::Array;
using arrow::BinaryArray;
using arrow::FixedSizeBinaryArray;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/column_reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index f63f6f1..ce6936d 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -21,10 +21,11 @@
#include <cstdint>
#include <memory>
+#include "arrow/util/rle-encoding.h"
+
#include "parquet/column_page.h"
#include "parquet/encoding-internal.h"
#include "parquet/properties.h"
-#include "parquet/util/rle-encoding.h"
using arrow::MemoryPool;
@@ -45,7 +46,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
num_bytes = *reinterpret_cast<const int32_t*>(data);
const uint8_t* decoder_data = data + sizeof(int32_t);
if (!rle_decoder_) {
- rle_decoder_.reset(new RleDecoder(decoder_data, num_bytes, bit_width_));
+ rle_decoder_.reset(new ::arrow::RleDecoder(decoder_data, num_bytes, bit_width_));
} else {
rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
}
@@ -55,7 +56,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
num_bytes =
static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
if (!bit_packed_decoder_) {
- bit_packed_decoder_.reset(new BitReader(data, num_bytes));
+ bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
} else {
bit_packed_decoder_->Reset(data, num_bytes);
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/column_reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column_reader.h b/src/parquet/column_reader.h
index f4b8b02..e733d67 100644
--- a/src/parquet/column_reader.h
+++ b/src/parquet/column_reader.h
@@ -36,11 +36,15 @@
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
-namespace parquet {
+namespace arrow {
class BitReader;
class RleDecoder;
+} // namespace arrow
+
+namespace parquet {
+
class PARQUET_EXPORT LevelDecoder {
public:
LevelDecoder();
@@ -58,8 +62,8 @@ class PARQUET_EXPORT LevelDecoder {
int bit_width_;
int num_values_remaining_;
Encoding::type encoding_;
- std::unique_ptr<RleDecoder> rle_decoder_;
- std::unique_ptr<BitReader> bit_packed_decoder_;
+ std::unique_ptr<::arrow::RleDecoder> rle_decoder_;
+ std::unique_ptr<::arrow::BitReader> bit_packed_decoder_;
};
class PARQUET_EXPORT ColumnReader {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/column_writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index c13d4a0..21550da 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -17,15 +17,20 @@
#include "parquet/column_writer.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/rle-encoding.h"
+
#include "parquet/encoding-internal.h"
#include "parquet/properties.h"
#include "parquet/statistics.h"
#include "parquet/util/logging.h"
#include "parquet/util/memory.h"
-#include "parquet/util/rle-encoding.h"
namespace parquet {
+using BitWriter = ::arrow::BitWriter;
+using RleEncoder = ::arrow::RleEncoder;
+
LevelEncoder::LevelEncoder() {}
LevelEncoder::~LevelEncoder() {}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/column_writer.h
----------------------------------------------------------------------
diff --git a/src/parquet/column_writer.h b/src/parquet/column_writer.h
index 4e113de..1637780 100644
--- a/src/parquet/column_writer.h
+++ b/src/parquet/column_writer.h
@@ -30,11 +30,15 @@
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
-namespace parquet {
+namespace arrow {
class BitWriter;
class RleEncoder;
+} // namespace arrow
+
+namespace parquet {
+
class PARQUET_EXPORT LevelEncoder {
public:
LevelEncoder();
@@ -61,8 +65,8 @@ class PARQUET_EXPORT LevelEncoder {
int bit_width_;
int rle_length_;
Encoding::type encoding_;
- std::unique_ptr<RleEncoder> rle_encoder_;
- std::unique_ptr<BitWriter> bit_packed_encoder_;
+ std::unique_ptr<::arrow::RleEncoder> rle_encoder_;
+ std::unique_ptr<::arrow::BitWriter> bit_packed_encoder_;
};
static constexpr int WRITE_BATCH_SIZE = 1000;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/encoding-internal.h
----------------------------------------------------------------------
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 61b8e24..88d781f 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -24,21 +24,23 @@
#include <memory>
#include <vector>
-#include <arrow/util/bit-util.h>
+#include "arrow/util/bit-stream-utils.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/cpu-info.h"
+#include "arrow/util/hash-util.h"
+#include "arrow/util/rle-encoding.h"
#include "parquet/encoding.h"
#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/bit-util.h"
-#include "parquet/util/cpu-info.h"
-#include "parquet/util/hash-util.h"
#include "parquet/util/memory.h"
-#include "parquet/util/rle-encoding.h"
namespace parquet {
+namespace BitUtil = ::arrow::BitUtil;
+using HashUtil = ::arrow::HashUtil;
+
class ColumnDescriptor;
// ----------------------------------------------------------------------
@@ -136,7 +138,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
- bit_reader_ = BitReader(data, len);
+ bit_reader_ = ::arrow::BitReader(data, len);
}
// Two flavors of bool decoding
@@ -161,7 +163,7 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
}
private:
- BitReader bit_reader_;
+ ::arrow::BitReader bit_reader_;
};
// ----------------------------------------------------------------------
@@ -196,7 +198,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {
bits_available_(kInMemoryDefaultCapacity * 8),
bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
values_sink_(new InMemoryOutputStream(pool)) {
- bit_writer_.reset(new BitWriter(
+ bit_writer_.reset(new ::arrow::BitWriter(
bits_buffer_->mutable_data(), static_cast<int>(bits_buffer_->size())));
}
@@ -260,7 +262,7 @@ class PlainEncoder<BooleanType> : public Encoder<BooleanType> {
protected:
int bits_available_;
- std::unique_ptr<BitWriter> bit_writer_;
+ std::unique_ptr<::arrow::BitWriter> bit_writer_;
std::shared_ptr<PoolBuffer> bits_buffer_;
std::unique_ptr<InMemoryOutputStream> values_sink_;
};
@@ -325,12 +327,13 @@ class DictionaryDecoder : public Decoder<Type> {
uint8_t bit_width = *data;
++data;
--len;
- idx_decoder_ = RleDecoder(data, len, bit_width);
+ idx_decoder_ = ::arrow::RleDecoder(data, len, bit_width);
}
int Decode(T* buffer, int max_values) override {
max_values = std::min(max_values, num_values_);
- int decoded_values = idx_decoder_.GetBatchWithDict(dictionary_, buffer, max_values);
+ int decoded_values =
+ idx_decoder_.GetBatchWithDict(dictionary_.data(), buffer, max_values);
if (decoded_values != max_values) { ParquetException::EofException(); }
num_values_ -= max_values;
return max_values;
@@ -338,8 +341,8 @@ class DictionaryDecoder : public Decoder<Type> {
int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset) override {
- int decoded_values = idx_decoder_.GetBatchWithDictSpaced(
- dictionary_, buffer, num_values, null_count, valid_bits, valid_bits_offset);
+ int decoded_values = idx_decoder_.GetBatchWithDictSpaced(dictionary_.data(), buffer,
+ num_values, null_count, valid_bits, valid_bits_offset);
if (decoded_values != num_values) { ParquetException::EofException(); }
return decoded_values;
}
@@ -354,7 +357,7 @@ class DictionaryDecoder : public Decoder<Type> {
// pointers).
std::shared_ptr<PoolBuffer> byte_array_data_;
- RleDecoder idx_decoder_;
+ ::arrow::RleDecoder idx_decoder_;
};
template <typename Type>
@@ -446,7 +449,7 @@ class DictEncoder : public Encoder<DType> {
dict_encoded_size_(0),
type_length_(desc->type_length()) {
hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY);
- if (!CpuInfo::initialized()) { CpuInfo::Init(); }
+ if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); }
}
virtual ~DictEncoder() { DCHECK(buffered_indices_.empty()); }
@@ -464,9 +467,9 @@ class DictEncoder : public Encoder<DType> {
// reserve
// an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used
// but not reserving them would cause the encoder to fail.
- return 1 + RleEncoder::MaxBufferSize(
+ return 1 + ::arrow::RleEncoder::MaxBufferSize(
bit_width(), static_cast<int>(buffered_indices_.size())) +
- RleEncoder::MinBufferSize(bit_width());
+ ::arrow::RleEncoder::MinBufferSize(bit_width());
}
/// The minimum bit width required to encode the currently buffered indices.
@@ -727,7 +730,7 @@ inline int DictEncoder<DType>::WriteIndices(uint8_t* buffer, int buffer_len) {
++buffer;
--buffer_len;
- RleEncoder encoder(buffer, buffer_len, bit_width());
+ ::arrow::RleEncoder encoder(buffer, buffer_len, bit_width());
for (int index : buffered_indices_) {
if (!encoder.Put(index)) return -1;
}
@@ -756,7 +759,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {
virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
- decoder_ = BitReader(data, len);
+ decoder_ = ::arrow::BitReader(data, len);
values_current_block_ = 0;
values_current_mini_block_ = 0;
}
@@ -819,7 +822,7 @@ class DeltaBitPackDecoder : public Decoder<DType> {
return max_values;
}
- BitReader decoder_;
+ ::arrow::BitReader decoder_;
int32_t values_current_block_;
int32_t num_mini_blocks_;
uint64_t values_per_mini_block_;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 2e78036..dcd813d 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -22,10 +22,11 @@
#include <string>
#include <vector>
+#include "arrow/util/bit-util.h"
+
#include "parquet/encoding-internal.h"
#include "parquet/schema.h"
#include "parquet/types.h"
-#include "parquet/util/bit-util.h"
#include "parquet/util/memory.h"
#include "parquet/util/test-common.h"
@@ -64,7 +65,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) {
ASSERT_EQ(nvalues, values_decoded);
for (int i = 0; i < nvalues; ++i) {
- ASSERT_EQ(draws[i], BitUtil::GetArrayBit(decode_data, i)) << i;
+ ASSERT_EQ(draws[i], BitUtil::GetBit(decode_data, i)) << i;
}
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/encoding.h
----------------------------------------------------------------------
diff --git a/src/parquet/encoding.h b/src/parquet/encoding.h
index 7c51cfd..1417e98 100644
--- a/src/parquet/encoding.h
+++ b/src/parquet/encoding.h
@@ -21,12 +21,11 @@
#include <cstdint>
#include <memory>
-#include <arrow/util/bit-util.h>
+#include "arrow/util/bit-util.h"
#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"
-#include "parquet/util/bit-util.h"
#include "parquet/util/memory.h"
namespace parquet {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 7ec3825..38015c4 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -24,7 +24,8 @@
#include <sstream>
#include <string>
-#include "parquet/util/compiler-util.h"
+#include "arrow/util/compiler-util.h"
+
#include "parquet/util/visibility.h"
namespace parquet {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt
index e8fbdc7..4717266 100644
--- a/src/parquet/util/CMakeLists.txt
+++ b/src/parquet/util/CMakeLists.txt
@@ -17,19 +17,11 @@
# Headers: util
install(FILES
- bit-stream-utils.h
- bit-stream-utils.inline.h
- bit-util.h
buffer-builder.h
- compiler-util.h
- cpu-info.h
- hash-util.h
logging.h
macros.h
memory.h
- rle-encoding.h
stopwatch.h
- sse-util.h
visibility.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util")
@@ -52,7 +44,5 @@ if (PARQUET_BUILD_BENCHMARKS)
endif()
endif()
-ADD_PARQUET_TEST(bit-util-test)
ADD_PARQUET_TEST(comparison-test)
ADD_PARQUET_TEST(memory-test)
-ADD_PARQUET_TEST(rle-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/util/bit-stream-utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.h b/src/parquet/util/bit-stream-utils.h
deleted file mode 100644
index 497a960..0000000
--- a/src/parquet/util/bit-stream-utils.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H
-#define PARQUET_UTIL_BIT_STREAM_UTILS_H
-
-#include <algorithm>
-#include <cstdint>
-#include <string.h>
-
-#include "parquet/util/bit-util.h"
-#include "parquet/util/compiler-util.h"
-#include "parquet/util/logging.h"
-
-namespace parquet {
-
-/// Utility class to write bit/byte streams. This class can write data to either be
-/// bit packed or byte aligned (and a single stream that has a mix of both).
-/// This class does not allocate memory.
-class BitWriter {
- public:
- /// buffer: buffer to write bits to. Buffer should be preallocated with
- /// 'buffer_len' bytes.
- BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
- Clear();
- }
-
- void Clear() {
- buffered_values_ = 0;
- byte_offset_ = 0;
- bit_offset_ = 0;
- }
-
- /// The number of current bytes written, including the current byte (i.e. may include a
- /// fraction of a byte). Includes buffered values.
- int bytes_written() const {
- return byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
- }
- uint8_t* buffer() const { return buffer_; }
- int buffer_len() const { return max_bytes_; }
-
- /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
- /// packed. Returns false if there was not enough space. num_bits must be <= 32.
- bool PutValue(uint64_t v, int num_bits);
-
- /// Writes v to the next aligned byte using num_bytes. If T is larger than
- /// num_bytes, the extra high-order bytes will be ignored. Returns false if
- /// there was not enough space.
- template <typename T>
- bool PutAligned(T v, int num_bytes);
-
- /// Write a Vlq encoded int to the buffer. Returns false if there was not enough
- /// room. The value is written byte aligned.
- /// For more details on vlq:
- /// en.wikipedia.org/wiki/Variable-length_quantity
- bool PutVlqInt(uint32_t v);
-
- // Writes an int zigzag encoded.
- bool PutZigZagVlqInt(int32_t v);
-
- /// Get a pointer to the next aligned byte and advance the underlying buffer
- /// by num_bytes.
- /// Returns NULL if there was not enough space.
- uint8_t* GetNextBytePtr(int num_bytes = 1);
-
- /// Flushes all buffered values to the buffer. Call this when done writing to
- /// the buffer. If 'align' is true, buffered_values_ is reset and any future
- /// writes will be written to the next byte boundary.
- void Flush(bool align = false);
-
- private:
- uint8_t* buffer_;
- int max_bytes_;
-
- /// Bit-packed values are initially written to this variable before being memcpy'd to
- /// buffer_. This is faster than writing values byte by byte directly to buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
-/// Utility class to read bit/byte stream. This class can read bits or bytes
-/// that are either byte aligned or not. It also has utilities to read multiple
-/// bytes in one read (e.g. encoded int).
-class BitReader {
- public:
- /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
- BitReader(const uint8_t* buffer, int buffer_len)
- : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- }
-
- BitReader() : buffer_(NULL), max_bytes_(0) {}
-
- void Reset(const uint8_t* buffer, int buffer_len) {
- buffer_ = buffer;
- max_bytes_ = buffer_len;
- byte_offset_ = 0;
- bit_offset_ = 0;
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- }
-
- /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
- /// there are not enough bytes left. num_bits must be <= 32.
- template <typename T>
- bool GetValue(int num_bits, T* v);
-
- /// Get a number of values from the buffer. Return the number of values actually read.
- template <typename T>
- int GetBatch(int num_bits, T* v, int batch_size);
-
- /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
- /// needs to be a little-endian native type and big enough to store
- /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
- /// be advanced to the start of the next byte before 'v' is read. Returns
- /// false if there are not enough bytes left.
- template <typename T>
- bool GetAligned(int num_bytes, T* v);
-
- /// Reads a vlq encoded int from the stream. The encoded int must start at
- /// the beginning of a byte. Return false if there were not enough bytes in
- /// the buffer.
- bool GetVlqInt(int32_t* v);
-
- // Reads a zigzag encoded int `into` v.
- bool GetZigZagVlqInt(int32_t* v);
-
- /// Returns the number of bytes left in the stream, not including the current
- /// byte (i.e., there may be an additional fraction of a byte).
- int bytes_left() {
- return max_bytes_ - (byte_offset_ + static_cast<int>(BitUtil::Ceil(bit_offset_, 8)));
- }
-
- /// Maximum byte length of a vlq encoded int
- static const int MAX_VLQ_BYTE_LEN = 5;
-
- private:
- const uint8_t* buffer_;
- int max_bytes_;
-
- /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
- /// faster than reading values byte by byte directly from buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
-} // namespace parquet
-
-#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-stream-utils.inline.h b/src/parquet/util/bit-stream-utils.inline.h
deleted file mode 100644
index 5db1639..0000000
--- a/src/parquet/util/bit-stream-utils.inline.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
-#define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
-
-#include <algorithm>
-
-#include "parquet/util/bit-stream-utils.h"
-#include "parquet/util/bpacking.h"
-
-namespace parquet {
-
-inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
- // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
- DCHECK_LE(num_bits, 32);
- DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
-
- if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
-
- buffered_values_ |= v << bit_offset_;
- bit_offset_ += num_bits;
-
- if (UNLIKELY(bit_offset_ >= 64)) {
- // Flush buffered_values_ and write out bits of v that did not fit
- memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
- buffered_values_ = 0;
- byte_offset_ += 8;
- bit_offset_ -= 64;
- buffered_values_ = v >> (num_bits - bit_offset_);
- }
- DCHECK_LT(bit_offset_, 64);
- return true;
-}
-
-inline void BitWriter::Flush(bool align) {
- int num_bytes = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
- DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
- memcpy(buffer_ + byte_offset_, &buffered_values_, num_bytes);
-
- if (align) {
- buffered_values_ = 0;
- byte_offset_ += num_bytes;
- bit_offset_ = 0;
- }
-}
-
-inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
- Flush(/* align */ true);
- DCHECK_LE(byte_offset_, max_bytes_);
- if (byte_offset_ + num_bytes > max_bytes_) return NULL;
- uint8_t* ptr = buffer_ + byte_offset_;
- byte_offset_ += num_bytes;
- return ptr;
-}
-
-template <typename T>
-inline bool BitWriter::PutAligned(T val, int num_bytes) {
- uint8_t* ptr = GetNextBytePtr(num_bytes);
- if (ptr == NULL) return false;
- memcpy(ptr, &val, num_bytes);
- return true;
-}
-
-inline bool BitWriter::PutVlqInt(uint32_t v) {
- bool result = true;
- while ((v & 0xFFFFFF80) != 0L) {
- result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
- v >>= 7;
- }
- result &= PutAligned<uint8_t>(v & 0x7F, 1);
- return result;
-}
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
- int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- *v = static_cast<T>(
- BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >> *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- *bit_offset += num_bits;
- if (*bit_offset >= 64) {
- *byte_offset += 8;
- *bit_offset -= 64;
-
- int bytes_remaining = max_bytes - *byte_offset;
- if (LIKELY(bytes_remaining >= 8)) {
- memcpy(buffered_values, buffer + *byte_offset, 8);
- } else {
- memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
- }
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800 4805)
-#endif
- // Read bits of v that crossed into new buffered_values_
- *v |= BitUtil::TrailingBits(*buffered_values, *bit_offset)
- << (num_bits - *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- DCHECK_LE(*bit_offset, 64);
- }
-}
-
-template <typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
- return GetBatch(num_bits, v, 1) == 1;
-}
-
-template <typename T>
-inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
- DCHECK(buffer_ != NULL);
- // TODO: revisit this limit if necessary
- DCHECK_LE(num_bits, 32);
- DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
-
- int bit_offset = bit_offset_;
- int byte_offset = byte_offset_;
- uint64_t buffered_values = buffered_values_;
- int max_bytes = max_bytes_;
- const uint8_t* buffer = buffer_;
-
- uint64_t needed_bits = num_bits * batch_size;
- uint64_t remaining_bits = (max_bytes - byte_offset) * 8 - bit_offset;
- if (remaining_bits < needed_bits) {
- batch_size = static_cast<int>(remaining_bits) / num_bits;
- }
-
- int i = 0;
- if (UNLIKELY(bit_offset != 0)) {
- for (; i < batch_size && bit_offset != 0; ++i) {
- GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
- &buffered_values);
- }
- }
-
- if (sizeof(T) == 4) {
- int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- } else {
- const int buffer_size = 1024;
- uint32_t unpack_buffer[buffer_size];
- while (i < batch_size) {
- int unpack_size = std::min(buffer_size, batch_size - i);
- int num_unpacked = unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- unpack_buffer, unpack_size, num_bits);
- if (num_unpacked == 0) { break; }
- for (int k = 0; k < num_unpacked; ++k) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- v[i + k] = unpack_buffer[k];
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- }
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- }
- }
-
- int bytes_remaining = max_bytes - byte_offset;
- if (bytes_remaining >= 8) {
- memcpy(&buffered_values, buffer + byte_offset, 8);
- } else {
- memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
- }
-
- for (; i < batch_size; ++i) {
- GetValue_(
- num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, &buffered_values);
- }
-
- bit_offset_ = bit_offset;
- byte_offset_ = byte_offset;
- buffered_values_ = buffered_values;
-
- return batch_size;
-}
-
-template <typename T>
-inline bool BitReader::GetAligned(int num_bytes, T* v) {
- DCHECK_LE(num_bytes, static_cast<int>(sizeof(T)));
- int bytes_read = static_cast<int>(BitUtil::Ceil(bit_offset_, 8));
- if (UNLIKELY(byte_offset_ + bytes_read + num_bytes > max_bytes_)) return false;
-
- // Advance byte_offset to next unread byte and read num_bytes
- byte_offset_ += bytes_read;
- memcpy(v, buffer_ + byte_offset_, num_bytes);
- byte_offset_ += num_bytes;
-
- // Reset buffered_values_
- bit_offset_ = 0;
- int bytes_remaining = max_bytes_ - byte_offset_;
- if (LIKELY(bytes_remaining >= 8)) {
- memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
- } else {
- memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
- }
- return true;
-}
-
-inline bool BitReader::GetVlqInt(int32_t* v) {
- *v = 0;
- int shift = 0;
- int num_bytes = 0;
- uint8_t byte = 0;
- do {
- if (!GetAligned<uint8_t>(1, &byte)) return false;
- *v |= (byte & 0x7F) << shift;
- shift += 7;
- DCHECK_LE(++num_bytes, MAX_VLQ_BYTE_LEN);
- } while ((byte & 0x80) != 0);
- return true;
-}
-
-inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
- uint32_t u = (v << 1) ^ (v >> 31);
- return PutVlqInt(u);
-}
-
-inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
- int32_t u_signed;
- if (!GetVlqInt(&u_signed)) return false;
- uint32_t u = static_cast<uint32_t>(u_signed);
- *reinterpret_cast<uint32_t*>(v) = (u >> 1) ^ -(static_cast<int32_t>(u & 1));
- return true;
-}
-
-} // namespace parquet
-
-#endif // PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util-test.cc b/src/parquet/util/bit-util-test.cc
deleted file mode 100644
index bc3e182..0000000
--- a/src/parquet/util/bit-util-test.cc
+++ /dev/null
@@ -1,190 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#include <gtest/gtest.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <boost/utility.hpp>
-
-#include <iostream>
-
-#include "parquet/util/bit-stream-utils.inline.h"
-#include "parquet/util/bit-util.h"
-#include "parquet/util/cpu-info.h"
-
-namespace parquet {
-
-static void ensure_cpu_info_initialized() {
- if (!CpuInfo::initialized()) { CpuInfo::Init(); }
-}
-
-TEST(BitUtil, Ceil) {
- EXPECT_EQ(BitUtil::Ceil(0, 1), 0);
- EXPECT_EQ(BitUtil::Ceil(1, 1), 1);
- EXPECT_EQ(BitUtil::Ceil(1, 2), 1);
- EXPECT_EQ(BitUtil::Ceil(1, 8), 1);
- EXPECT_EQ(BitUtil::Ceil(7, 8), 1);
- EXPECT_EQ(BitUtil::Ceil(8, 8), 1);
- EXPECT_EQ(BitUtil::Ceil(9, 8), 2);
- EXPECT_EQ(BitUtil::Ceil(9, 9), 1);
- EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000);
- EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1);
- EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10);
-}
-
-TEST(BitUtil, RoundUp) {
- EXPECT_EQ(BitUtil::RoundUp(0, 1), 0);
- EXPECT_EQ(BitUtil::RoundUp(1, 1), 1);
- EXPECT_EQ(BitUtil::RoundUp(1, 2), 2);
- EXPECT_EQ(BitUtil::RoundUp(6, 2), 6);
- EXPECT_EQ(BitUtil::RoundUp(7, 3), 9);
- EXPECT_EQ(BitUtil::RoundUp(9, 9), 9);
- EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010);
- EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000);
- EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000);
-}
-
-TEST(BitUtil, RoundDown) {
- EXPECT_EQ(BitUtil::RoundDown(0, 1), 0);
- EXPECT_EQ(BitUtil::RoundDown(1, 1), 1);
- EXPECT_EQ(BitUtil::RoundDown(1, 2), 0);
- EXPECT_EQ(BitUtil::RoundDown(6, 2), 6);
- EXPECT_EQ(BitUtil::RoundDown(7, 3), 6);
- EXPECT_EQ(BitUtil::RoundDown(9, 9), 9);
- EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000);
- EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0);
- EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000);
-}
-
-TEST(BitUtil, Popcount) {
- ensure_cpu_info_initialized();
-
- EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
- EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4);
- EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
- EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6);
- EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
- EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8);
- EXPECT_EQ(BitUtil::Popcount(0), 0);
- EXPECT_EQ(BitUtil::PopcountNoHw(0), 0);
-}
-
-TEST(BitUtil, TrailingBits) {
- EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0);
- EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1);
- EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64),
- BOOST_BINARY(1 1 1 1 1 1 1 1));
- EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100),
- BOOST_BINARY(1 1 1 1 1 1 1 1));
- EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0);
- EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0);
- EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0);
- EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0);
- EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63);
-}
-
-TEST(BitUtil, ByteSwap) {
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0)), 0);
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0x11223344)), 0x44332211);
-
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0)), 0);
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<int32_t>(0x11223344)), 0x44332211);
-
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint64_t>(0)), 0);
- EXPECT_EQ(
- BitUtil::ByteSwap(static_cast<uint64_t>(0x1122334455667788)), 0x8877665544332211);
-
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<int64_t>(0)), 0);
- EXPECT_EQ(
- BitUtil::ByteSwap(static_cast<int64_t>(0x1122334455667788)), 0x8877665544332211);
-
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0)), 0);
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<int16_t>(0x1122)), 0x2211);
-
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
- EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
-}
-
-TEST(BitUtil, Log2) {
- EXPECT_EQ(BitUtil::Log2(1), 0);
- EXPECT_EQ(BitUtil::Log2(2), 1);
- EXPECT_EQ(BitUtil::Log2(3), 2);
- EXPECT_EQ(BitUtil::Log2(4), 2);
- EXPECT_EQ(BitUtil::Log2(5), 3);
- EXPECT_EQ(BitUtil::Log2(INT_MAX), 31);
- EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32);
- EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64);
-}
-
-TEST(BitUtil, RoundUpToPowerOf2) {
- EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8);
- EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8);
- EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16);
-}
-
-TEST(BitUtil, RoundDownToPowerOf2) {
- EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0);
- EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8);
- EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8);
-}
-
-TEST(BitUtil, RoundUpDown) {
- EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1);
- EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1);
- EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2);
- EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0);
- EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1);
- EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1);
-
- EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1);
- EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1);
- EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2);
- EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0);
- EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1);
- EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1);
-
- EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1);
- EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1);
- EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2);
- EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0);
- EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1);
- EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
-}
-
-void TestZigZag(int32_t v) {
- uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
- BitWriter writer(buffer, sizeof(buffer));
- BitReader reader(buffer, sizeof(buffer));
- writer.PutZigZagVlqInt(v);
- int32_t result;
- EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
- EXPECT_EQ(v, result);
-}
-
-TEST(BitStreamUtil, ZigZag) {
- TestZigZag(0);
- TestZigZag(1);
- TestZigZag(-1);
- TestZigZag(std::numeric_limits<int32_t>::max());
- TestZigZag(-std::numeric_limits<int32_t>::max());
-}
-
-} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/2f5ef895/src/parquet/util/bit-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/bit-util.h b/src/parquet/util/bit-util.h
deleted file mode 100644
index e315b5f..0000000
--- a/src/parquet/util/bit-util.h
+++ /dev/null
@@ -1,340 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala as of 2016-01-29
-
-#ifndef PARQUET_UTIL_BIT_UTIL_H
-#define PARQUET_UTIL_BIT_UTIL_H
-
-#if defined(__APPLE__)
-#include <machine/endian.h>
-#elif defined(_WIN32)
-#define __LITTLE_ENDIAN 1
-#else
-#include <endian.h>
-#endif
-
-#if defined(_MSC_VER)
-#define PARQUET_BYTE_SWAP64 _byteswap_uint64
-#define PARQUET_BYTE_SWAP32 _byteswap_ulong
-#else
-#define PARQUET_BYTE_SWAP64 __builtin_bswap64
-#define PARQUET_BYTE_SWAP32 __builtin_bswap32
-#endif
-
-#include <cstdint>
-
-#include "parquet/util/compiler-util.h"
-
-#ifdef PARQUET_USE_SSE
-#include "parquet/util/cpu-info.h"
-#include "parquet/util/sse-util.h"
-#endif
-
-namespace parquet {
-
-#define INIT_BITSET(valid_bits_vector, valid_bits_index) \
- int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
- int bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \
- uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector];
-
-#define READ_NEXT_BITSET(valid_bits_vector) \
- bit_offset_##valid_bits_vector++; \
- if (bit_offset_##valid_bits_vector == 8) { \
- bit_offset_##valid_bits_vector = 0; \
- byte_offset_##valid_bits_vector++; \
- bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \
- }
-
-// TODO(wesm): The source from Impala was depending on boost::make_unsigned
-//
-// We add a partial stub implementation here
-
-template <typename T>
-struct make_unsigned {};
-
-template <>
-struct make_unsigned<int8_t> {
- typedef uint8_t type;
-};
-
-template <>
-struct make_unsigned<int16_t> {
- typedef uint16_t type;
-};
-
-template <>
-struct make_unsigned<int32_t> {
- typedef uint32_t type;
-};
-
-template <>
-struct make_unsigned<int64_t> {
- typedef uint64_t type;
-};
-
-/// Utility class to do standard bit tricks
-class BitUtil {
- public:
- /// Returns the ceil of value/divisor
- static inline int64_t Ceil(int64_t value, int64_t divisor) {
- return value / divisor + (value % divisor != 0);
- }
-
- /// Returns 'value' rounded up to the nearest multiple of 'factor'
- static inline int64_t RoundUp(int64_t value, int64_t factor) {
- return (value + (factor - 1)) / factor * factor;
- }
-
- /// Returns 'value' rounded down to the nearest multiple of 'factor'
- static inline int64_t RoundDown(int64_t value, int64_t factor) {
- return (value / factor) * factor;
- }
-
- /// Returns the smallest power of two that contains v. Taken from
- /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
- /// TODO: Pick a better name, as it is not clear what happens when the input is
- /// already a power of two.
- static inline int64_t NextPowerOfTwo(int64_t v) {
- --v;
- v |= v >> 1;
- v |= v >> 2;
- v |= v >> 4;
- v |= v >> 8;
- v |= v >> 16;
- v |= v >> 32;
- ++v;
- return v;
- }
-
- /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is
- /// a power of two
- static inline int RoundUpToPowerOf2(int value, int factor) {
- // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
- return (value + (factor - 1)) & ~(factor - 1);
- }
-
- static inline int RoundDownToPowerOf2(int value, int factor) {
- // DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
- return value & ~(factor - 1);
- }
-
- /// Specialized round up and down functions for frequently used factors,
- /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64).
- /// Returns the rounded up number of bytes that fit the number of bits.
- static inline uint32_t RoundUpNumBytes(uint32_t bits) { return (bits + 7) >> 3; }
-
- /// Returns the rounded down number of bytes that fit the number of bits.
- static inline uint32_t RoundDownNumBytes(uint32_t bits) { return bits >> 3; }
-
- /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32.
- static inline uint32_t RoundUpNumi32(uint32_t bits) { return (bits + 31) >> 5; }
-
- /// Returns the rounded up 32 multiple.
- static inline uint32_t RoundDownNumi32(uint32_t bits) { return bits >> 5; }
-
- /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64.
- static inline uint32_t RoundUpNumi64(uint32_t bits) { return (bits + 63) >> 6; }
-
- /// Returns the rounded down to 64 multiple.
- static inline uint32_t RoundDownNumi64(uint32_t bits) { return bits >> 6; }
-
- /// Non hw accelerated pop count.
- /// TODO: we don't use this in any perf sensitive code paths currently. There
- /// might be a much faster way to implement this.
- static inline int PopcountNoHw(uint64_t x) {
- int count = 0;
- for (; x != 0; ++count)
- x &= x - 1;
- return count;
- }
-
- /// Returns the number of set bits in x
- static inline int Popcount(uint64_t x) {
-#ifdef PARQUET_USE_SSE
- if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) {
- return POPCNT_popcnt_u64(x);
- } else {
- return PopcountNoHw(x);
- }
-#else
- return PopcountNoHw(x);
-#endif
- }
-
- // Compute correct population count for various-width signed integers
- template <typename T>
- static inline int PopcountSigned(T v) {
- // Converting to same-width unsigned then extending preserves the bit pattern.
- return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
- }
-
- /// Returns the 'num_bits' least-significant bits of 'v'.
- static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
- if (UNLIKELY(num_bits == 0)) return 0;
- if (UNLIKELY(num_bits >= 64)) return v;
- int n = 64 - num_bits;
- return (v << n) >> n;
- }
-
- /// Returns ceil(log2(x)).
- /// TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up
- /// in a hot path.
- static inline int Log2(uint64_t x) {
- // DCHECK_GT(x, 0);
- if (x == 1) return 0;
- // Compute result = ceil(log2(x))
- // = floor(log2(x - 1)) + 1, for x > 1
- // by finding the position of the most significant bit (1-indexed) of x - 1
- // (floor(log2(n)) = MSB(n) (0-indexed))
- --x;
- int result = 1;
- while (x >>= 1)
- ++result;
- return result;
- }
-
- /// Swaps the byte order (i.e. endianess)
- static inline int64_t ByteSwap(int64_t value) { return PARQUET_BYTE_SWAP64(value); }
- static inline uint64_t ByteSwap(uint64_t value) {
- return static_cast<uint64_t>(PARQUET_BYTE_SWAP64(value));
- }
- static inline int32_t ByteSwap(int32_t value) { return PARQUET_BYTE_SWAP32(value); }
- static inline uint32_t ByteSwap(uint32_t value) {
- return static_cast<uint32_t>(PARQUET_BYTE_SWAP32(value));
- }
- static inline int16_t ByteSwap(int16_t value) {
- return (((value >> 8) & 0xff) | ((value & 0xff) << 8));
- }
- static inline uint16_t ByteSwap(uint16_t value) {
- return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
- }
-
- /// Write the swapped bytes into dst. Src and st cannot overlap.
- static inline void ByteSwap(void* dst, const void* src, int len) {
- switch (len) {
- case 1:
- *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
- return;
- case 2:
- *reinterpret_cast<int16_t*>(dst) =
- ByteSwap(*reinterpret_cast<const int16_t*>(src));
- return;
- case 4:
- *reinterpret_cast<int32_t*>(dst) =
- ByteSwap(*reinterpret_cast<const int32_t*>(src));
- return;
- case 8:
- *reinterpret_cast<int64_t*>(dst) =
- ByteSwap(*reinterpret_cast<const int64_t*>(src));
- return;
- default:
- break;
- }
-
- uint8_t* d = reinterpret_cast<uint8_t*>(dst);
- const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
- for (int i = 0; i < len; ++i) {
- d[i] = s[len - i - 1];
- }
- }
-
-/// Converts to big endian format (if not already in big endian) from the
-/// machine's native endian format.
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
- static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
- static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
- static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
- static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
- static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
-#else
- static inline int64_t ToBigEndian(int64_t val) { return val; }
- static inline uint64_t ToBigEndian(uint64_t val) { return val; }
- static inline int32_t ToBigEndian(int32_t val) { return val; }
- static inline uint32_t ToBigEndian(uint32_t val) { return val; }
- static inline int16_t ToBigEndian(int16_t val) { return val; }
- static inline uint16_t ToBigEndian(uint16_t val) { return val; }
-#endif
-
-/// Converts from big endian format to the machine's native endian format.
-#if __BYTE_ORDER == __LITTLE_ENDIAN
- static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
- static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
- static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
- static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
- static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
- static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
-#else
- static inline int64_t FromBigEndian(int64_t val) { return val; }
- static inline uint64_t FromBigEndian(uint64_t val) { return val; }
- static inline int32_t FromBigEndian(int32_t val) { return val; }
- static inline uint32_t FromBigEndian(uint32_t val) { return val; }
- static inline int16_t FromBigEndian(int16_t val) { return val; }
- static inline uint16_t FromBigEndian(uint16_t val) { return val; }
-#endif
-
- // Logical right shift for signed integer types
- // This is needed because the C >> operator does arithmetic right shift
- // Negative shift amounts lead to undefined behavior
- template <typename T>
- static T ShiftRightLogical(T v, int shift) {
- // Conversion to unsigned ensures most significant bits always filled with 0's
- return static_cast<typename make_unsigned<T>::type>(v) >> shift;
- }
-
- // Get an specific bit of a numeric type
- template <typename T>
- static inline int8_t GetBit(T v, int bitpos) {
- T masked = v & (static_cast<T>(0x1) << bitpos);
- return static_cast<int8_t>(ShiftRightLogical(masked, bitpos));
- }
-
- // Set a specific bit to 1
- // Behavior when bitpos is negative is undefined
- template <typename T>
- static T SetBit(T v, int bitpos) {
- return v | (static_cast<T>(0x1) << bitpos);
- }
-
- static inline bool GetArrayBit(const uint8_t* bits, int i) {
- return (bits[i / 8] & (1 << (i % 8))) != 0;
- }
-
- static inline void SetArrayBit(uint8_t* bits, int i, bool is_set) {
- bits[i / 8] |= (1 << (i % 8)) * is_set;
- }
-
- // Set a specific bit to 0
- // Behavior when bitpos is negative is undefined
- template <typename T>
- static T UnsetBit(T v, int bitpos) {
- return v & ~(static_cast<T>(0x1) << bitpos);
- }
-
- // Returns the minimum number of bits needed to represent the value of 'x'
- static inline int NumRequiredBits(uint64_t x) {
- for (int i = 63; i >= 0; --i) {
- if (x & (UINT64_C(1) << i)) return i + 1;
- }
- return 0;
- }
-};
-
-} // namespace parquet
-
-#endif // PARQUET_UTIL_BIT_UTIL_H