You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/05 19:50:03 UTC
parquet-cpp git commit: PARQUET-681: Add tool to scan a parquet file
Repository: parquet-cpp
Updated Branches:
refs/heads/master fa6e47616 -> c67ee3e52
PARQUET-681: Add tool to scan a parquet file
Added a ReadBatchValues() API to the Column class.
Added a parquet-scan tool
Separated examples into benchmarks/tools
added clang tidy and clang format to benchmarks and tools
Author: Deepak Majeti <de...@hpe.com>
Closes #144 from majetideepak/parquetscan and squashes the following commits:
cc7f183 [Deepak Majeti] Removed GetRemainingInPage API
44da480 [Deepak Majeti] add scan all in public api
20829b8 [Deepak Majeti] clang-format
da62354 [Deepak Majeti] ScanAllValues API
e385f61 [Deepak Majeti] put clang-* in the root directory
9ff785c [Deepak Majeti] use c++ random
d854bde [Deepak Majeti] parquet scan tool
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/c67ee3e5
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/c67ee3e5
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/c67ee3e5
Branch: refs/heads/master
Commit: c67ee3e521b93531f4b59d3183a04ad5d316cbae
Parents: fa6e476
Author: Deepak Majeti <de...@hpe.com>
Authored: Mon Sep 5 15:49:36 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 5 15:49:36 2016 -0400
----------------------------------------------------------------------
.clang-format | 65 +++++
.clang-tidy | 13 +
.clang-tidy-ignore | 1 +
CMakeLists.txt | 15 +-
benchmarks/CMakeLists.txt | 29 +++
benchmarks/decode_benchmark.cc | 461 ++++++++++++++++++++++++++++++++++++
example/CMakeLists.txt | 36 ---
example/decode_benchmark.cc | 460 -----------------------------------
example/parquet-dump-schema.cc | 39 ---
example/parquet_reader.cc | 70 ------
src/.clang-format | 65 -----
src/.clang-tidy | 13 -
src/.clang-tidy-ignore | 1 -
src/parquet/api/reader.h | 1 +
src/parquet/column/reader.h | 1 -
src/parquet/column/scan-all.h | 67 ++++++
src/parquet/column/scanner.h | 1 -
src/parquet/file/reader.cc | 6 +-
src/parquet/schema/types.cc | 8 +-
src/parquet/types-test.cc | 62 +++--
src/parquet/types.cc | 33 ++-
src/parquet/types.h | 10 +-
tools/CMakeLists.txt | 34 +++
tools/parquet-dump-schema.cc | 36 +++
tools/parquet-scan.cc | 108 +++++++++
tools/parquet_reader.cc | 67 ++++++
26 files changed, 962 insertions(+), 740 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-format
----------------------------------------------------------------------
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..7d5b3cf
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,65 @@
+---
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -1
+AlignAfterOpenBracket: false
+AlignConsecutiveAssignments: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1000
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy
----------------------------------------------------------------------
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..6fc3742
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,13 @@
+---
+# Checks is a comma-separated list of globs (not regular expressions); a
+# leading '-' removes the matching checks from the enabled set.
+Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-*,modernize-*,readability-*'
+# Unlike Checks, HeaderFilterRegex is a regular expression.
+HeaderFilterRegex: 'parquet/.*'
+AnalyzeTemporaryDtors: true
+CheckOptions:
+  - key: google-readability-braces-around-statements.ShortStatementLines
+    value: '1'
+  - key: google-readability-function-size.StatementThreshold
+    value: '800'
+  - key: google-readability-namespace-comments.ShortNamespaceLines
+    value: '10'
+  - key: google-readability-namespace-comments.SpacesBeforeComments
+    value: '2'
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/.clang-tidy-ignore b/.clang-tidy-ignore
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/.clang-tidy-ignore
@@ -0,0 +1 @@
+
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c26e79..86d0684 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -404,7 +404,7 @@ if (UNIX)
--verbose=2
--linelength=90
--filter=-whitespace/comments,-readability/todo,-build/header_guard,-runtime/references,-readability/check,-build/c++11
- `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/parquet\\/thrift/g'`)
endif (UNIX)
############################################################
@@ -414,11 +414,11 @@ endif (UNIX)
if (${CLANG_FORMAT_FOUND})
# runs clang format and updates files in place.
add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
- `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
# runs clang format and exits with a non-zero exit code if any files need to be reformatted
add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0
- `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
endif()
@@ -429,10 +429,10 @@ endif()
if (${CLANG_TIDY_FOUND})
# runs clang-tidy and attempts to fix any warning automatically
add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1
- `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`)
+ `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc | sed -e '/_types/g' | sed -e '/_constants/g'`)
# runs clang-tidy and exits with a non-zero exit code if any errors are found.
add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json
- 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore`)
+ 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/tools ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy-ignore`)
endif()
@@ -575,11 +575,12 @@ add_subdirectory(src/parquet/schema)
add_subdirectory(src/parquet/thrift)
add_subdirectory(src/parquet/util)
-# Ensure that thrift compilation is done befor using its generated headers
+# Ensure that thrift compilation is done before using its generated headers
# in parquet code.
add_dependencies(parquet_objlib parquet_thrift)
-add_subdirectory(example)
+add_subdirectory(benchmarks)
+add_subdirectory(tools)
add_custom_target(clean-all
COMMAND ${CMAKE_BUILD_TOOL} clean
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..1df5dea
--- /dev/null
+++ b/benchmarks/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+SET(LINK_LIBS
+ snappystatic
+ thriftstatic)
+
+if (PARQUET_BUILD_EXECUTABLES)
+ add_executable(decode_benchmark decode_benchmark.cc)
+
+  # decode_benchmark uses private (non-public) parquet APIs
+ target_link_libraries(decode_benchmark ${LINK_LIBS}
+ parquet_static)
+
+endif()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/benchmarks/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
new file mode 100644
index 0000000..d3748ca
--- /dev/null
+++ b/benchmarks/decode_benchmark.cc
@@ -0,0 +1,461 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "parquet/compression/codec.h"
+#include "parquet/encodings/plain-encoding.h"
+#include "parquet/encodings/dictionary-encoding.h"
+#include "parquet/encodings/delta-bit-pack-encoding.h"
+#include "parquet/encodings/delta-byte-array-encoding.h"
+#include "parquet/encodings/delta-length-byte-array-encoding.h"
+#include "parquet/util/stopwatch.h"
+
+/**
+ * Test bed for encodings and some utilities to measure their throughput.
+ * TODO: this file needs some major cleanup.
+ */
+
+// Minimal delta bit-pack encoder used to generate input for the decoder
+// benchmarks in this file. Values are buffered by Add() and written out by
+// Encode() as a single block made of fixed-size mini blocks.
+class DeltaBitPackEncoder {
+ public:
+  explicit DeltaBitPackEncoder(int mini_block_size = 8)
+      : mini_block_size_(mini_block_size) {}
+
+  // Buffers one value; nothing is encoded until Encode() is called.
+  void Add(int64_t v) { values_.push_back(v); }
+
+  // Encodes every buffered value and stores the encoded size in *encoded_len.
+  //
+  // Returns a buffer allocated with new[]; ownership passes to the caller, who
+  // must release it with delete[]. At least one value must have been added,
+  // because the first value is written verbatim. The buffered values are
+  // overwritten with their deltas, so Encode() must not be called twice on the
+  // same encoder.
+  uint8_t* Encode(int* encoded_len) {
+    assert(!values_.empty());
+
+    // Fixed-size output buffer handed to the BitWriter.
+    const int buffer_size = 10 * 1024 * 1024;
+    uint8_t* result = new uint8_t[buffer_size];
+    int num_mini_blocks = parquet::BitUtil::Ceil(num_values() - 1, mini_block_size_);
+    uint8_t* mini_block_widths = nullptr;
+
+    parquet::BitWriter writer(result, buffer_size);
+
+    // Write the size of each block. We only use 1 block currently.
+    writer.PutVlqInt(num_mini_blocks * mini_block_size_);
+
+    // Write the number of mini blocks.
+    writer.PutVlqInt(num_mini_blocks);
+
+    // Write the number of values (excluding the first, which is stored as is).
+    writer.PutVlqInt(num_values() - 1);
+
+    // Write the first value.
+    writer.PutZigZagVlqInt(values_[0]);
+
+    // Compute the values as deltas and the min delta. Walking backwards lets
+    // each delta be computed in place from the still-unmodified predecessor.
+    int64_t min_delta = std::numeric_limits<int64_t>::max();
+    for (int i = num_values() - 1; i > 0; --i) {
+      values_[i] -= values_[i - 1];
+      min_delta = std::min(min_delta, values_[i]);
+    }
+
+    // Write out the min delta.
+    writer.PutZigZagVlqInt(min_delta);
+
+    // We need to save num_mini_blocks bytes to store the bit widths of the mini
+    // blocks.
+    mini_block_widths = writer.GetNextBytePtr(num_mini_blocks);
+
+    // Index 0 holds the first value, so the deltas start at index 1.
+    int idx = 1;
+    for (int i = 0; i < num_mini_blocks; ++i) {
+      int n = std::min(mini_block_size_, num_values() - idx);
+
+      // Compute the max delta in this mini block.
+      int64_t max_delta = std::numeric_limits<int64_t>::min();
+      for (int j = 0; j < n; ++j) {
+        max_delta = std::max(values_[idx + j], max_delta);
+      }
+
+      // The bit width for this block is the number of bits needed to store
+      // (max_delta - min_delta).
+      int bit_width = parquet::BitUtil::NumRequiredBits(max_delta - min_delta);
+      mini_block_widths[i] = bit_width;
+
+      // Encode this mini block using min_delta and bit_width.
+      for (int j = 0; j < n; ++j) {
+        writer.PutValue(values_[idx + j] - min_delta, bit_width);
+      }
+
+      // Pad out the last block.
+      for (int j = n; j < mini_block_size_; ++j) {
+        writer.PutValue(0, bit_width);
+      }
+      idx += n;
+    }
+
+    writer.Flush();
+    *encoded_len = writer.bytes_written();
+    return result;
+  }
+
+  // Number of values added so far.
+  int num_values() const { return values_.size(); }
+
+ private:
+  int mini_block_size_;
+  std::vector<int64_t> values_;
+};
+
+// Encodes byte arrays as their delta-bit-packed lengths followed by the
+// concatenated bytes. Used to generate input for the decoder tests below.
+//
+// The encoder owns a fixed 10 MB scratch buffer. Add() and Encode() do no
+// bounds checking, so the total payload plus the encoded lengths must fit in
+// it. Owning the buffer through unique_ptr makes the class move-only.
+class DeltaLengthByteArrayEncoder {
+ public:
+  explicit DeltaLengthByteArrayEncoder(int mini_block_size = 8)
+      : len_encoder_(mini_block_size),
+        buffer_(new uint8_t[10 * 1024 * 1024]),
+        offset_(0),
+        plain_encoded_len_(0) {}
+
+  // Appends the bytes of `s`.
+  void Add(const std::string& s) {
+    Add(reinterpret_cast<const uint8_t*>(s.data()), s.size());
+  }
+
+  // Appends `len` bytes starting at `ptr` and records the length.
+  void Add(const uint8_t* ptr, int len) {
+    plain_encoded_len_ += len + sizeof(int);
+    len_encoder_.Add(len);
+    memcpy(buffer_.get() + offset_, ptr, len);
+    offset_ += len;
+  }
+
+  // Produces the encoded stream and stores its total size in *encoded_len.
+  //
+  // Layout: [int: size of encoded lengths][encoded lengths][raw bytes].
+  // The returned pointer refers to storage owned by this encoder; it stays
+  // valid for the encoder's lifetime and must not be freed by the caller.
+  uint8_t* Encode(int* encoded_len) {
+    // The length encoder returns a new[] buffer that we own; release it on exit.
+    std::unique_ptr<uint8_t[]> encoded_lengths(len_encoder_.Encode(encoded_len));
+    uint8_t* out = buffer_.get();
+    // Shift the raw bytes right to make room for the header; the ranges may
+    // overlap, hence memmove.
+    memmove(out + *encoded_len + sizeof(int), out, offset_);
+    memcpy(out, encoded_len, sizeof(int));
+    memcpy(out + sizeof(int), encoded_lengths.get(), *encoded_len);
+    *encoded_len += offset_ + sizeof(int);
+    return out;
+  }
+
+  int num_values() const { return len_encoder_.num_values(); }
+  // Size the same data would take with a sizeof(int) length prefix per value.
+  int plain_encoded_len() const { return plain_encoded_len_; }
+
+ private:
+  DeltaBitPackEncoder len_encoder_;
+  std::unique_ptr<uint8_t[]> buffer_;
+  int offset_;
+  int plain_encoded_len_;
+};
+
+// Encodes strings incrementally: each value is stored as the length of the
+// prefix it shares with the previous value plus the remaining suffix.
+class DeltaByteArrayEncoder {
+ public:
+  DeltaByteArrayEncoder() : plain_encoded_len_(0) {}
+
+  // Adds `s`, splitting it into a shared-prefix length (relative to the
+  // previously added value) and a suffix.
+  void Add(const std::string& s) {
+    plain_encoded_len_ += s.size() + sizeof(int);
+    const size_t min_len = std::min(s.size(), last_value_.size());
+    size_t prefix_len = 0;
+    while (prefix_len < min_len && s[prefix_len] == last_value_[prefix_len]) {
+      ++prefix_len;
+    }
+    prefix_len_encoder_.Add(prefix_len);
+    suffix_encoder_.Add(reinterpret_cast<const uint8_t*>(s.data()) + prefix_len,
+        static_cast<int>(s.size() - prefix_len));
+    last_value_ = s;
+  }
+
+  // Produces the encoded stream and stores its size in *encoded_len.
+  //
+  // Layout: [int: size of prefix lengths][prefix lengths][encoded suffixes].
+  // Returns a buffer allocated with new[]; the caller owns it and must
+  // delete[] it. The 10 MB output buffer is not bounds-checked.
+  uint8_t* Encode(int* encoded_len) {
+    int prefix_buffer_len;
+    // The bit-pack encoder returns a new[] buffer that we own; free it on exit.
+    std::unique_ptr<uint8_t[]> prefix_buffer(
+        prefix_len_encoder_.Encode(&prefix_buffer_len));
+    int suffix_buffer_len;
+    // The suffix encoder returns a pointer into storage it keeps as a member,
+    // so it is not freed here.
+    uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len);
+
+    uint8_t* buffer = new uint8_t[10 * 1024 * 1024];
+    memcpy(buffer, &prefix_buffer_len, sizeof(int));
+    memcpy(buffer + sizeof(int), prefix_buffer.get(), prefix_buffer_len);
+    memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len);
+    *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len;
+    return buffer;
+  }
+
+  int num_values() const { return prefix_len_encoder_.num_values(); }
+  // Size the same data would take with a sizeof(int) length prefix per value.
+  int plain_encoded_len() const { return plain_encoded_len_; }
+
+ private:
+  DeltaBitPackEncoder prefix_len_encoder_;
+  DeltaLengthByteArrayEncoder suffix_encoder_;
+  std::string last_value_;
+  int plain_encoded_len_;
+};
+
+// Decodes `num_values` plain-encoded int64 values from `data`, requesting
+// `batch_size` values per Decode() call, and returns the sum of the decoded
+// values.
+uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) {
+  uint64_t result = 0;
+  parquet::PlainDecoder<parquet::Int64Type> decoder(nullptr);
+  decoder.SetData(num_values, data, num_values * sizeof(int64_t));
+  // Heap-backed batch buffer: a runtime-sized stack array is not standard C++.
+  std::vector<int64_t> values(batch_size);
+  for (int i = 0; i < num_values;) {
+    int n = decoder.Decode(values.data(), batch_size);
+    // Assumes Decode() returns the number of values produced; stop rather
+    // than spin forever if it makes no progress.
+    if (n <= 0) { break; }
+    for (int j = 0; j < n; ++j) {
+      result += values[j];
+    }
+    i += n;
+  }
+  return result;
+}
+
+// Encodes `values` with DeltaBitPackEncoder, then either verifies the round
+// trip or measures decode throughput.
+//
+// name: label printed with the results.
+// values: input values; must not be empty.
+// benchmark_iters: -1 runs the correctness check; any other value is the
+//   number of timed decode passes.
+// benchmark_batch_size: values requested per Decode() call while timing.
+//
+// Returns 0 in verification mode, otherwise the sum of all decoded values.
+uint64_t TestBinaryPackedEncoding(const char* name, const std::vector<int64_t>& values,
+    int benchmark_iters = -1, int benchmark_batch_size = 1) {
+  int mini_block_size;
+  if (values.size() < 8) {
+    mini_block_size = 8;
+  } else if (values.size() < 16) {
+    mini_block_size = 16;
+  } else {
+    mini_block_size = 32;
+  }
+  parquet::DeltaBitPackDecoder<parquet::Int64Type> decoder(nullptr);
+  DeltaBitPackEncoder encoder(mini_block_size);
+  for (size_t i = 0; i < values.size(); ++i) {
+    encoder.Add(values[i]);
+  }
+
+  // The inputs are int64_t, so the unencoded size is 8 bytes per value.
+  int raw_len = encoder.num_values() * sizeof(int64_t);
+  int len;
+  // Encode() returns a new[] buffer owned by the caller; free it on return.
+  std::unique_ptr<uint8_t[]> buffer(encoder.Encode(&len));
+
+  printf("%s\n", name);
+  printf(" Raw len: %d\n", raw_len);
+  printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / static_cast<float>(raw_len));
+
+  if (benchmark_iters == -1) {
+    // Verification mode: decode one value at a time and report the first
+    // mismatch against the input.
+    decoder.SetData(encoder.num_values(), buffer.get(), len);
+    for (int i = 0; i < encoder.num_values(); ++i) {
+      int64_t x = 0;
+      decoder.Decode(&x, 1);
+      if (values[i] != x) {
+        std::cerr << "Bad: " << i << std::endl;
+        std::cerr << " " << x << " != " << values[i] << std::endl;
+        break;
+      }
+    }
+    return 0;
+  }
+
+  // Benchmark mode.
+  uint64_t result = 0;
+  // Heap-backed batch buffer: a runtime-sized stack array is not standard C++.
+  std::vector<int64_t> buf(benchmark_batch_size);
+  parquet::StopWatch sw;
+  sw.Start();
+  for (int k = 0; k < benchmark_iters; ++k) {
+    decoder.SetData(encoder.num_values(), buffer.get(), len);
+    for (size_t i = 0; i < values.size();) {
+      int n = decoder.Decode(buf.data(), benchmark_batch_size);
+      // Assumes Decode() returns the number of values produced; stop rather
+      // than spin forever if it makes no progress.
+      if (n <= 0) { break; }
+      for (int j = 0; j < n; ++j) {
+        result += buf[j];
+      }
+      i += n;
+    }
+  }
+  uint64_t elapsed = sw.Stop();
+  double num_ints = values.size() * benchmark_iters * 1000.;
+  printf("%s rate (batch size = %2d): %0.3fM per second.\n", name, benchmark_batch_size,
+      num_ints / elapsed);
+  return result;
+}
+
+#define TEST(NAME, FN, DATA, BATCH_SIZE) \
+ sw.Start(); \
+ for (int i = 0; i < NUM_ITERS; ++i) { \
+ FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE); \
+ } \
+ elapsed = sw.Stop(); \
+ printf("%s rate (batch size = %2d): %0.3fM per second.\n", NAME, BATCH_SIZE, \
+ mult / elapsed);
+
+// Compresses `data` once with `codec`, then times `num_iters` rounds of
+// decompressing it and plain-decoding the result with the given batch size.
+// Prints the compressed size and the measured rate.
+void TestPlainIntCompressed(parquet::Codec* codec, const std::vector<int64_t>& data,
+    int num_iters, int batch_size) {
+  // data() is well defined for an empty vector, unlike &data[0].
+  const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(data.data());
+  int uncompressed_len = data.size() * sizeof(int64_t);
+  // Vectors own the scratch buffers so they are released on every exit path.
+  std::vector<uint8_t> decompressed_data(uncompressed_len);
+
+  int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data);
+  std::vector<uint8_t> compressed_data(max_compressed_size);
+  int compressed_len = codec->Compress(
+      uncompressed_len, raw_data, max_compressed_size, compressed_data.data());
+
+  printf("\n%s:\n Uncompressed len: %d\n Compressed len: %d\n", codec->name(),
+      uncompressed_len, compressed_len);
+
+  double mult = num_iters * data.size() * 1000.;
+  parquet::StopWatch sw;
+  sw.Start();
+  // The sum is never read afterwards; presumably it exists so the decode work
+  // has a visible result. TODO confirm.
+  uint64_t r = 0;
+  for (int i = 0; i < num_iters; ++i) {
+    codec->Decompress(compressed_len, compressed_data.data(), uncompressed_len,
+        decompressed_data.data());
+    r += TestPlainIntEncoding(decompressed_data.data(), data.size(), batch_size);
+  }
+  int64_t elapsed = sw.Stop();
+  printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n",
+      codec->name(), batch_size, mult / elapsed);
+}
+
+// Runs the delta bit-pack round-trip check over a handful of fixed and
+// pseudo-random inputs.
+void TestBinaryPacking() {
+  // One hundred zeros.
+  std::vector<int64_t> values(100, 0);
+  TestBinaryPackedEncoding("Zeros", values);
+
+  // 1, 2, 3, 4, 5
+  values.clear();
+  for (int v = 1; v <= 5; ++v) {
+    values.push_back(v);
+  }
+  TestBinaryPackedEncoding("Example 1", values);
+
+  // A falling then rising sequence.
+  values = {7, 5, 3, 1, 2, 3, 4, 5};
+  TestBinaryPackedEncoding("Example 2", values);
+
+  // Fixed seed so every run sees the same data. Both distributions draw from
+  // the same generator and include their upper bound.
+  std::mt19937 gen(0);
+  const int kNumRandom = 500000;
+
+  // Random ints between 0 and 10K.
+  values.clear();
+  std::uniform_int_distribution<int> wide(0, 10000);
+  for (int count = 0; count < kNumRandom; ++count) {
+    values.push_back(wide(gen));
+  }
+  TestBinaryPackedEncoding("Rand [0, 10000)", values);
+
+  // Random ints between 0 and 100.
+  values.clear();
+  std::uniform_int_distribution<int> narrow(0, 100);
+  for (int count = 0; count < kNumRandom; ++count) {
+    values.push_back(narrow(gen));
+  }
+  TestBinaryPackedEncoding("Rand [0, 100)", values);
+}
+
+// Round-trips a few strings through DeltaLengthByteArrayEncoder and the
+// library decoder, printing any value that does not survive.
+void TestDeltaLengthByteArray() {
+  parquet::DeltaLengthByteArrayDecoder decoder(nullptr);
+  DeltaLengthByteArrayEncoder encoder;
+
+  const std::vector<std::string> expected = {"Hello", "World", "Foobar", "ABCDEF"};
+  for (const std::string& s : expected) {
+    encoder.Add(s);
+  }
+
+  int encoded_size = 0;
+  uint8_t* encoded = encoder.Encode(&encoded_size);
+  printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n",
+      encoder.plain_encoded_len(), encoded_size);
+
+  decoder.SetData(encoder.num_values(), encoded, encoded_size);
+  const int count = encoder.num_values();
+  for (int i = 0; i < count; ++i) {
+    parquet::ByteArray v = {0, nullptr};
+    decoder.Decode(&v, 1);
+    const std::string r(reinterpret_cast<const char*>(v.ptr), v.len);
+    if (r != expected[i]) {
+      std::cout << "Bad " << r << " != " << expected[i] << std::endl;
+    }
+  }
+}
+
+// Round-trips a sorted word list through DeltaByteArrayEncoder and the
+// library decoder, printing any value that does not survive.
+void TestDeltaByteArray() {
+  parquet::DeltaByteArrayDecoder decoder(nullptr);
+  DeltaByteArrayEncoder encoder;
+
+  std::vector<std::string> values;
+
+  // Wikipedia example
+  values.push_back("myxa");
+  values.push_back("myxophyta");
+  values.push_back("myxopod");
+  values.push_back("nab");
+  values.push_back("nabbed");
+  values.push_back("nabbing");
+  values.push_back("nabit");
+  values.push_back("nabk");
+  values.push_back("nabob");
+  values.push_back("nacarat");
+  values.push_back("nacelle");
+
+  for (size_t i = 0; i < values.size(); ++i) {
+    encoder.Add(values[i]);
+  }
+
+  int len = 0;
+  // Encode() returns a new[] buffer owned by the caller; free it on return.
+  std::unique_ptr<uint8_t[]> buffer(encoder.Encode(&len));
+  printf("DeltaByteArray\n Raw len: %d\n Encoded len: %d\n",
+      encoder.plain_encoded_len(), len);
+  decoder.SetData(encoder.num_values(), buffer.get(), len);
+  for (int i = 0; i < encoder.num_values(); ++i) {
+    // Start from a defined value so a failed decode cannot read garbage.
+    parquet::ByteArray v = {0, nullptr};
+    decoder.Decode(&v, 1);
+    std::string r = std::string(reinterpret_cast<const char*>(v.ptr), v.len);
+    if (r != values[i]) { std::cout << "Bad " << r << " != " << values[i] << std::endl; }
+  }
+}
+
+// Runs the encoder round-trip checks, then the plain, delta bit-packed and
+// Snappy-compressed decode benchmarks at several batch sizes.
+int main(int argc, char** argv) {
+  TestBinaryPacking();
+  TestDeltaLengthByteArray();
+  TestDeltaByteArray();
+
+  // The TEST macro refers to the following names directly.
+  parquet::StopWatch sw;
+  uint64_t elapsed = 0;
+  const int NUM_VALUES = 1024 * 1024;
+  const int NUM_ITERS = 10;
+  const double mult = NUM_VALUES * NUM_ITERS * 1000.;
+
+  const int batch_sizes[] = {1, 16, 32, 64};
+
+  // Plain decoding over zero-initialized data.
+  std::vector<int64_t> plain_int_data(NUM_VALUES);
+  for (int batch_size : batch_sizes) {
+    TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, batch_size);
+  }
+
+  // Test rand ints between 0 and 10K
+  std::vector<int64_t> values;
+  std::mt19937 gen(0);
+  std::uniform_int_distribution<int> d(0, 10000);
+  for (int count = 0; count < 1000000; ++count) {
+    values.push_back(d(gen));
+  }
+  for (int batch_size : batch_sizes) {
+    TestBinaryPackedEncoding("Rand 0-10K", values, 100, batch_size);
+  }
+
+  parquet::SnappyCodec snappy_codec;
+  for (int batch_size : batch_sizes) {
+    TestPlainIntCompressed(&snappy_codec, values, 100, batch_size);
+  }
+
+  return 0;
+}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
deleted file mode 100644
index d622d09..0000000
--- a/example/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-SET(LINK_LIBS
- snappystatic
- thriftstatic)
-
-if (PARQUET_BUILD_EXECUTABLES)
- add_executable(decode_benchmark decode_benchmark.cc)
-
- # This uses private APIs
- target_link_libraries(decode_benchmark ${LINK_LIBS}
- parquet_static)
-
- add_executable(parquet-dump-schema parquet-dump-schema.cc)
- target_link_libraries(parquet-dump-schema ${LINK_LIBS}
- parquet_static)
-
- add_executable(parquet_reader parquet_reader.cc)
- target_link_libraries(parquet_reader ${LINK_LIBS}
- parquet_static)
-endif()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/example/decode_benchmark.cc b/example/decode_benchmark.cc
deleted file mode 100644
index 4eb1975..0000000
--- a/example/decode_benchmark.cc
+++ /dev/null
@@ -1,460 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-#include <stdio.h>
-
-#include "parquet/compression/codec.h"
-#include "parquet/encodings/plain-encoding.h"
-#include "parquet/encodings/dictionary-encoding.h"
-#include "parquet/encodings/delta-bit-pack-encoding.h"
-#include "parquet/encodings/delta-byte-array-encoding.h"
-#include "parquet/encodings/delta-length-byte-array-encoding.h"
-#include "parquet/util/stopwatch.h"
-
-using namespace parquet;
-using namespace std;
-
-/**
- * Test bed for encodings and some utilities to measure their throughput.
- * TODO: this file needs some major cleanup.
- */
-
-class DeltaBitPackEncoder {
- public:
- DeltaBitPackEncoder(int mini_block_size = 8) {
- mini_block_size_ = mini_block_size;
- }
-
- void Add(int64_t v) {
- values_.push_back(v);
- }
-
- uint8_t* Encode(int* encoded_len) {
- uint8_t* result = new uint8_t[10 * 1024 * 1024];
- int num_mini_blocks = BitUtil::Ceil(num_values() - 1, mini_block_size_);
- uint8_t* mini_block_widths = NULL;
-
- BitWriter writer(result, 10 * 1024 * 1024);
-
- // Writer the size of each block. We only use 1 block currently.
- writer.PutVlqInt(num_mini_blocks * mini_block_size_);
-
- // Write the number of mini blocks.
- writer.PutVlqInt(num_mini_blocks);
-
- // Write the number of values.
- writer.PutVlqInt(num_values() - 1);
-
- // Write the first value.
- writer.PutZigZagVlqInt(values_[0]);
-
- // Compute the values as deltas and the min delta.
- int64_t min_delta = numeric_limits<int64_t>::max();
- for (int i = values_.size() - 1; i > 0; --i) {
- values_[i] -= values_[i - 1];
- min_delta = min(min_delta, values_[i]);
- }
-
- // Write out the min delta.
- writer.PutZigZagVlqInt(min_delta);
-
- // We need to save num_mini_blocks bytes to store the bit widths of the mini blocks.
- mini_block_widths = writer.GetNextBytePtr(num_mini_blocks);
-
- int idx = 1;
- for (int i = 0; i < num_mini_blocks; ++i) {
- int n = min(mini_block_size_, num_values() - idx);
-
- // Compute the max delta in this mini block.
- int64_t max_delta = numeric_limits<int64_t>::min();
- for (int j = 0; j < n; ++j) {
- max_delta = max(values_[idx + j], max_delta);
- }
-
- // The bit width for this block is the number of bits needed to store
- // (max_delta - min_delta).
- int bit_width = BitUtil::NumRequiredBits(max_delta - min_delta);
- mini_block_widths[i] = bit_width;
-
- // Encode this mini blocking using min_delta and bit_width
- for (int j = 0; j < n; ++j) {
- writer.PutValue(values_[idx + j] - min_delta, bit_width);
- }
-
- // Pad out the last block.
- for (int j = n; j < mini_block_size_; ++j) {
- writer.PutValue(0, bit_width);
- }
- idx += n;
- }
-
- writer.Flush();
- *encoded_len = writer.bytes_written();
- return result;
- }
-
- int num_values() const { return values_.size(); }
-
- private:
- int mini_block_size_;
- vector<int64_t> values_;
-};
-
-class DeltaLengthByteArrayEncoder {
- public:
- DeltaLengthByteArrayEncoder(int mini_block_size = 8) :
- len_encoder_(mini_block_size),
- buffer_(new uint8_t[10 * 1024 * 1024]),
- offset_(0),
- plain_encoded_len_(0) {
- }
-
- void Add(const string& s) {
- Add(reinterpret_cast<const uint8_t*>(s.data()), s.size());
- }
-
- void Add(const uint8_t* ptr, int len) {
- plain_encoded_len_ += len + sizeof(int);
- len_encoder_.Add(len);
- memcpy(buffer_ + offset_, ptr, len);
- offset_ += len;
- }
-
- uint8_t* Encode(int* encoded_len) {
- uint8_t* encoded_lengths = len_encoder_.Encode(encoded_len);
- memmove(buffer_ + *encoded_len + sizeof(int), buffer_, offset_);
- memcpy(buffer_, encoded_len, sizeof(int));
- memcpy(buffer_ + sizeof(int), encoded_lengths, *encoded_len);
- *encoded_len += offset_ + sizeof(int);
- return buffer_;
- }
-
- int num_values() const { return len_encoder_.num_values(); }
- int plain_encoded_len() const { return plain_encoded_len_; }
-
- private:
- DeltaBitPackEncoder len_encoder_;
- uint8_t* buffer_;
- int offset_;
- int plain_encoded_len_;
-};
-
-class DeltaByteArrayEncoder {
- public:
- DeltaByteArrayEncoder() : plain_encoded_len_(0) {}
-
- void Add(const string& s) {
- plain_encoded_len_ += s.size() + sizeof(int);
- int min_len = min(s.size(), last_value_.size());
- int prefix_len = 0;
- for (int i = 0; i < min_len; ++i) {
- if (s[i] == last_value_[i]) {
- ++prefix_len;
- } else {
- break;
- }
- }
- prefix_len_encoder_.Add(prefix_len);
- suffix_encoder_.Add(reinterpret_cast<const uint8_t*>(s.data()) + prefix_len,
- s.size() - prefix_len);
- last_value_ = s;
- }
-
- uint8_t* Encode(int* encoded_len) {
- int prefix_buffer_len;
- uint8_t* prefix_buffer = prefix_len_encoder_.Encode(&prefix_buffer_len);
- int suffix_buffer_len;
- uint8_t* suffix_buffer = suffix_encoder_.Encode(&suffix_buffer_len);
-
- uint8_t* buffer = new uint8_t[10 * 1024 * 1024];
- memcpy(buffer, &prefix_buffer_len, sizeof(int));
- memcpy(buffer + sizeof(int), prefix_buffer, prefix_buffer_len);
- memcpy(buffer + sizeof(int) + prefix_buffer_len, suffix_buffer, suffix_buffer_len);
- *encoded_len = sizeof(int) + prefix_buffer_len + suffix_buffer_len;
- return buffer;
- }
-
- int num_values() const { return prefix_len_encoder_.num_values(); }
- int plain_encoded_len() const { return plain_encoded_len_; }
-
- private:
- DeltaBitPackEncoder prefix_len_encoder_;
- DeltaLengthByteArrayEncoder suffix_encoder_;
- string last_value_;
- int plain_encoded_len_;
-};
-
-
-uint64_t TestPlainIntEncoding(const uint8_t* data, int num_values, int batch_size) {
- uint64_t result = 0;
- PlainDecoder<Int64Type> decoder(nullptr);
- decoder.SetData(num_values, data, num_values * sizeof(int64_t));
- int64_t values[batch_size];
- for (int i = 0; i < num_values;) {
- int n = decoder.Decode(values, batch_size);
- for (int j = 0; j < n; ++j) {
- result += values[j];
- }
- i += n;
- }
- return result;
-}
-
-uint64_t TestBinaryPackedEncoding(const char* name, const vector<int64_t>& values,
- int benchmark_iters = -1, int benchmark_batch_size = 1) {
- int mini_block_size;
- if (values.size() < 8) {
- mini_block_size = 8;
- } else if (values.size() < 16) {
- mini_block_size = 16;
- } else {
- mini_block_size = 32;
- }
- DeltaBitPackDecoder<Int64Type> decoder(nullptr);
- DeltaBitPackEncoder encoder(mini_block_size);
- for (size_t i = 0; i < values.size(); ++i) {
- encoder.Add(values[i]);
- }
-
- int raw_len = encoder.num_values() * sizeof(int);
- int len;
- uint8_t* buffer = encoder.Encode(&len);
-
- if (benchmark_iters == -1) {
- printf("%s\n", name);
- printf(" Raw len: %d\n", raw_len);
- printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len);
- decoder.SetData(encoder.num_values(), buffer, len);
- for (int i = 0; i < encoder.num_values(); ++i) {
- int64_t x = 0;
- decoder.Decode(&x, 1);
- if (values[i] != x) {
- cerr << "Bad: " << i << endl;
- cerr << " " << x << " != " << values[i] << endl;
- break;
- }
- }
- return 0;
- } else {
- printf("%s\n", name);
- printf(" Raw len: %d\n", raw_len);
- printf(" Encoded len: %d (%0.2f%%)\n", len, len * 100 / (float)raw_len);
-
- uint64_t result = 0;
- int64_t buf[benchmark_batch_size];
- StopWatch sw;
- sw.Start();\
- for (int k = 0; k < benchmark_iters; ++k) {
- decoder.SetData(encoder.num_values(), buffer, len);
- for (size_t i = 0; i < values.size();) {
- int n = decoder.Decode(buf, benchmark_batch_size);
- for (int j = 0; j < n; ++j) {
- result += buf[j];
- }
- i += n;
- }
- }
- uint64_t elapsed = sw.Stop();
- double num_ints = values.size() * benchmark_iters * 1000.;
- printf("%s rate (batch size = %2d): %0.3fM per second.\n",
- name, benchmark_batch_size, num_ints / elapsed);
- return result;
- }
-}
-
-#define TEST(NAME, FN, DATA, BATCH_SIZE)\
- sw.Start();\
- for (int i = 0; i < NUM_ITERS; ++i) {\
- FN(reinterpret_cast<uint8_t*>(&DATA[0]), NUM_VALUES, BATCH_SIZE);\
- }\
- elapsed = sw.Stop();\
- printf("%s rate (batch size = %2d): %0.3fM per second.\n",\
- NAME, BATCH_SIZE, mult / elapsed);
-
-void TestPlainIntCompressed(Codec* codec, const vector<int64_t>& data, int num_iters, int batch_size) {
- const uint8_t* raw_data = reinterpret_cast<const uint8_t*>(&data[0]);
- int uncompressed_len = data.size() * sizeof(int64_t);
- uint8_t* decompressed_data = new uint8_t[uncompressed_len];
-
- int max_compressed_size = codec->MaxCompressedLen(uncompressed_len, raw_data);
- uint8_t* compressed_data = new uint8_t[max_compressed_size];
- int compressed_len = codec->Compress(uncompressed_len, raw_data,
- max_compressed_size, compressed_data);
-
- printf("\n%s:\n Uncompressed len: %d\n Compressed len: %d\n",
- codec->name(), uncompressed_len, compressed_len);
-
- double mult = num_iters * data.size() * 1000.;
- StopWatch sw;
- sw.Start();
- uint64_t r = 0;
- for (int i = 0; i < num_iters; ++i) {
- codec->Decompress(compressed_len, compressed_data,
- uncompressed_len, decompressed_data);
- r += TestPlainIntEncoding(decompressed_data, data.size(), batch_size);
- }
- int64_t elapsed = sw.Stop();\
- printf("Compressed(%s) plain int rate (batch size = %2d): %0.3fM per second.\n",
- codec->name(), batch_size, mult / elapsed);
-
- delete[] compressed_data;
- delete[] decompressed_data;
-}
-
-void TestBinaryPacking() {
- vector<int64_t> values;
- values.clear();
- for (int i = 0; i < 100; ++i) values.push_back(0);
- TestBinaryPackedEncoding("Zeros", values);
-
- values.clear();
- for (int i = 1; i <= 5; ++i) values.push_back(i);
- TestBinaryPackedEncoding("Example 1", values);
-
- values.clear();
- values.push_back(7);
- values.push_back(5);
- values.push_back(3);
- values.push_back(1);
- values.push_back(2);
- values.push_back(3);
- values.push_back(4);
- values.push_back(5);
- TestBinaryPackedEncoding("Example 2", values);
-
- // Test rand ints between 0 and 10K
- values.clear();
- for (int i = 0; i < 500000; ++i) {
- values.push_back(rand() % (10000));
- }
- TestBinaryPackedEncoding("Rand [0, 10000)", values);
-
- // Test rand ints between 0 and 100
- values.clear();
- for (int i = 0; i < 500000; ++i) {
- values.push_back(rand() % 100);
- }
- TestBinaryPackedEncoding("Rand [0, 100)", values);
-}
-
-void TestDeltaLengthByteArray() {
- DeltaLengthByteArrayDecoder decoder(nullptr);
- DeltaLengthByteArrayEncoder encoder;
-
- vector<string> values;
- values.push_back("Hello");
- values.push_back("World");
- values.push_back("Foobar");
- values.push_back("ABCDEF");
-
- for (size_t i = 0; i < values.size(); ++i) {
- encoder.Add(values[i]);
- }
-
- int len = 0;
- uint8_t* buffer = encoder.Encode(&len);
- printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n",
- encoder.plain_encoded_len(), len);
- decoder.SetData(encoder.num_values(), buffer, len);
- for (int i = 0; i < encoder.num_values(); ++i) {
- ByteArray v = {0, NULL};
- decoder.Decode(&v, 1);
- string r = string((char*)v.ptr, v.len);
- if (r != values[i]) {
- cout << "Bad " << r << " != " << values[i] << endl;
- }
- }
-}
-
-void TestDeltaByteArray() {
- DeltaByteArrayDecoder decoder(nullptr);
- DeltaByteArrayEncoder encoder;
-
- vector<string> values;
-
- // Wikipedia example
- values.push_back("myxa");
- values.push_back("myxophyta");
- values.push_back("myxopod");
- values.push_back("nab");
- values.push_back("nabbed");
- values.push_back("nabbing");
- values.push_back("nabit");
- values.push_back("nabk");
- values.push_back("nabob");
- values.push_back("nacarat");
- values.push_back("nacelle");
-
- for (size_t i = 0; i < values.size(); ++i) {
- encoder.Add(values[i]);
- }
-
- int len = 0;
- uint8_t* buffer = encoder.Encode(&len);
- printf("DeltaLengthByteArray\n Raw len: %d\n Encoded len: %d\n",
- encoder.plain_encoded_len(), len);
- decoder.SetData(encoder.num_values(), buffer, len);
- for (int i = 0; i < encoder.num_values(); ++i) {
- ByteArray v;
- decoder.Decode(&v, 1);
- string r = string((char*)v.ptr, v.len);
- if (r != values[i]) {
- cout << "Bad " << r << " != " << values[i] << endl;
- }
- }
-}
-
-int main(int argc, char** argv) {
- TestBinaryPacking();
- TestDeltaLengthByteArray();
- TestDeltaByteArray();
-
- StopWatch sw;
- uint64_t elapsed = 0;
-
- const int NUM_VALUES = 1024 * 1024;
- const int NUM_ITERS = 10;
- const double mult = NUM_VALUES * NUM_ITERS * 1000.;
-
- vector<int64_t> plain_int_data;
- plain_int_data.resize(NUM_VALUES);
-
- TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 1);
- TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 16);
- TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 32);
- TEST("Plain decoder", TestPlainIntEncoding, plain_int_data, 64);
-
- // Test rand ints between 0 and 10K
- vector<int64_t> values;
- for (int i = 0; i < 1000000; ++i) {
- values.push_back(rand() % 10000);
- }
- TestBinaryPackedEncoding("Rand 0-10K", values, 100, 1);
- TestBinaryPackedEncoding("Rand 0-10K", values, 100, 16);
- TestBinaryPackedEncoding("Rand 0-10K", values, 100, 32);
- TestBinaryPackedEncoding("Rand 0-10K", values, 100, 64);
-
- SnappyCodec snappy_codec;
-
- TestPlainIntCompressed(&snappy_codec, values, 100, 1);
- TestPlainIntCompressed(&snappy_codec, values, 100, 16);
- TestPlainIntCompressed(&snappy_codec, values, 100, 32);
- TestPlainIntCompressed(&snappy_codec, values, 100, 64);
-
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/example/parquet-dump-schema.cc b/example/parquet-dump-schema.cc
deleted file mode 100644
index ed7b570..0000000
--- a/example/parquet-dump-schema.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-
-#include <parquet/api/reader.h>
-#include <parquet/api/schema.h>
-
-using namespace parquet;
-
-int main(int argc, char** argv) {
- std::string filename = argv[1];
-
- try {
- std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename);
- PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
- } catch (const std::exception& e) {
- std::cerr << "Parquet error: "
- << e.what()
- << std::endl;
- return -1;
- }
-
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/example/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/example/parquet_reader.cc b/example/parquet_reader.cc
deleted file mode 100644
index 2baa2b1..0000000
--- a/example/parquet_reader.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <iostream>
-#include <memory>
-#include <list>
-
-#include <parquet/api/reader.h>
-
-using namespace parquet;
-
-int main(int argc, char** argv) {
- if (argc > 5 || argc < 2) {
- std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--columns=...] <file>"
- << std::endl;
- return -1;
- }
-
- std::string filename;
- bool print_values = true;
- bool memory_map = true;
-
- // Read command-line options
- const std::string COLUMNS_PREFIX = "--columns=";
- std::list<int> columns;
-
- char *param, *value;
- for (int i = 1; i < argc; i++) {
- if ((param = std::strstr(argv[i], "--only-metadata"))) {
- print_values = false;
- } else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
- memory_map = false;
- } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
- value = std::strtok(param+COLUMNS_PREFIX.length(), "," );
- while (value) {
- columns.push_back(std::atoi(value));
- value = std::strtok(nullptr, "," );
- }
- } else {
- filename = argv[i];
- }
- }
-
- try {
- std::unique_ptr<ParquetFileReader> reader = ParquetFileReader::OpenFile(filename,
- memory_map);
- reader->DebugPrint(std::cout, columns, print_values);
- } catch (const std::exception& e) {
- std::cerr << "Parquet error: "
- << e.what()
- << std::endl;
- return -1;
- }
-
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-format
----------------------------------------------------------------------
diff --git a/src/.clang-format b/src/.clang-format
deleted file mode 100644
index 7d5b3cf..0000000
--- a/src/.clang-format
+++ /dev/null
@@ -1,65 +0,0 @@
----
-Language: Cpp
-# BasedOnStyle: Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: false
-AlignConsecutiveAssignments: false
-AlignEscapedNewlinesLeft: true
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-ColumnLimit: 90
-CommentPragmas: '^ IWYU pragma:'
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
-IndentCaseLabels: true
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakBeforeFirstCallParameter: 1000
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Cpp11
-TabWidth: 8
-UseTab: Never
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy
----------------------------------------------------------------------
diff --git a/src/.clang-tidy b/src/.clang-tidy
deleted file mode 100644
index 6fc3742..0000000
--- a/src/.clang-tidy
+++ /dev/null
@@ -1,13 +0,0 @@
----
-Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*'
-HeaderFilterRegex: 'parquet/.*'
-AnalyzeTemporaryDtors: true
-CheckOptions:
- - key: google-readability-braces-around-statements.ShortStatementLines
- value: '1'
- - key: google-readability-function-size.StatementThreshold
- value: '800'
- - key: google-readability-namespace-comments.ShortNamespaceLines
- value: '10'
- - key: google-readability-namespace-comments.SpacesBeforeComments
- value: '2'
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/.clang-tidy-ignore
----------------------------------------------------------------------
diff --git a/src/.clang-tidy-ignore b/src/.clang-tidy-ignore
deleted file mode 100644
index 8b13789..0000000
--- a/src/.clang-tidy-ignore
+++ /dev/null
@@ -1 +0,0 @@
-
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/api/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/api/reader.h b/src/parquet/api/reader.h
index 1e0c5e3..0bd4a24 100644
--- a/src/parquet/api/reader.h
+++ b/src/parquet/api/reader.h
@@ -20,6 +20,7 @@
// Column reader API
#include "parquet/column/reader.h"
+#include "parquet/column/scan-all.h"
#include "parquet/exception.h"
#include "parquet/file/reader.h"
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/reader.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/reader.h b/src/parquet/column/reader.h
index 25df2b4..8243e85 100644
--- a/src/parquet/column/reader.h
+++ b/src/parquet/column/reader.h
@@ -67,7 +67,6 @@ class PARQUET_EXPORT ColumnReader {
int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels);
// Read multiple repetition levels into preallocated memory
- //
// Returns the number of decoded repetition levels
int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scan-all.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scan-all.h b/src/parquet/column/scan-all.h
new file mode 100644
index 0000000..fd63bff
--- /dev/null
+++ b/src/parquet/column/scan-all.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_SCAN_ALL_H
+#define PARQUET_SCAN_ALL_H
+
+#include <cstdint>
+
+#include "parquet/column/reader.h"
+#include "parquet/exception.h"
+
+// Reads one batch through `reader` after casting it to the concrete reader RType.
+//
+// `values` is an untyped byte buffer that is reinterpreted as an array of RType::T,
+// so the caller must provide storage suitable for that type. The downcast of
+// `reader` is unchecked: RType must be the reader's real type. Every argument is
+// forwarded to RType::ReadBatch and its return value is passed back unchanged.
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+    uint8_t* values, int64_t* values_buffered, parquet::ColumnReader* reader) {
+  using Type = typename RType::T;
+  auto typed_reader = static_cast<RType*>(reader);
+  auto vals = reinterpret_cast<Type*>(values);
+  return typed_reader->ReadBatch(
+      batch_size, def_levels, rep_levels, vals, values_buffered);
+}
+
+// Type-erased front end for ScanAll: selects the typed reader that matches
+// reader->type() and forwards all arguments to it.
+//
+// Calls ParquetException::NYI for a physical type that has no case below.
+//
+// Declared inline because the definition lives in a header; without it, including
+// this file from more than one translation unit produces duplicate-symbol link
+// errors.
+inline int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels,
+    int16_t* rep_levels, uint8_t* values, int64_t* values_buffered,
+    parquet::ColumnReader* reader) {
+  switch (reader->type()) {
+    case parquet::Type::BOOLEAN:
+      return ScanAll<parquet::BoolReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT32:
+      return ScanAll<parquet::Int32Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT64:
+      return ScanAll<parquet::Int64Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::INT96:
+      return ScanAll<parquet::Int96Reader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::FLOAT:
+      return ScanAll<parquet::FloatReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::DOUBLE:
+      return ScanAll<parquet::DoubleReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::BYTE_ARRAY:
+      return ScanAll<parquet::ByteArrayReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+      return ScanAll<parquet::FixedLenByteArrayReader>(
+          batch_size, def_levels, rep_levels, values, values_buffered, reader);
+    default:
+      parquet::ParquetException::NYI("type reader not implemented");
+  }
+  // Unreachable code, but suppress compiler warning
+  return 0;
+}
+
+#endif  // PARQUET_SCAN_ALL_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index bc5c5ce..4373c7a 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -48,7 +48,6 @@ class PARQUET_EXPORT Scanner {
value_offset_(0),
values_buffered_(0),
reader_(reader) {
- // TODO: don't allocate for required fields
def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/file/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/reader.cc b/src/parquet/file/reader.cc
index 50db1c0..62481f7 100644
--- a/src/parquet/file/reader.cc
+++ b/src/parquet/file/reader.cc
@@ -144,7 +144,7 @@ void ParquetFileReader::DebugPrint(
for (auto i : selected_columns) {
const ColumnDescriptor* descr = file_metadata->schema_descriptor()->Column(i);
stream << "Column " << i << ": " << descr->name() << " ("
- << type_to_string(descr->physical_type()) << ")" << std::endl;
+ << TypeToString(descr->physical_type()) << ")" << std::endl;
}
for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
@@ -173,10 +173,10 @@ void ParquetFileReader::DebugPrint(
stream << " Statistics Not Set";
}
stream << std::endl
- << " compression: " << compression_to_string(column_chunk->compression())
+ << " compression: " << CompressionToString(column_chunk->compression())
<< ", encodings: ";
for (auto encoding : column_chunk->encodings()) {
- stream << encoding_to_string(encoding) << " ";
+ stream << EncodingToString(encoding) << " ";
}
stream << std::endl
<< " uncompressed size: " << column_chunk->total_uncompressed_size()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index 2e5d151..043ef00 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -101,7 +101,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
case LogicalType::JSON:
case LogicalType::BSON:
if (type != Type::BYTE_ARRAY) {
- ss << logical_type_to_string(logical_type);
+ ss << LogicalTypeToString(logical_type);
ss << " can only annotate BYTE_ARRAY fields";
throw ParquetException(ss.str());
}
@@ -138,7 +138,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
case LogicalType::INT_16:
case LogicalType::INT_32:
if (type != Type::INT32) {
- ss << logical_type_to_string(logical_type);
+ ss << LogicalTypeToString(logical_type);
ss << " can only annotate INT32";
throw ParquetException(ss.str());
}
@@ -149,7 +149,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
case LogicalType::UINT_64:
case LogicalType::INT_64:
if (type != Type::INT64) {
- ss << logical_type_to_string(logical_type);
+ ss << LogicalTypeToString(logical_type);
ss << " can only annotate INT64";
throw ParquetException(ss.str());
}
@@ -167,7 +167,7 @@ PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetitio
}
break;
default:
- ss << logical_type_to_string(logical_type);
+ ss << LogicalTypeToString(logical_type);
ss << " can not be applied to a primitive type";
throw ParquetException(ss.str());
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
index 59ed456..06f202e 100644
--- a/src/parquet/types-test.cc
+++ b/src/parquet/types-test.cc
@@ -24,43 +24,41 @@
namespace parquet {
TEST(TestTypeToString, PhysicalTypes) {
- ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str());
- ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str());
- ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str());
- ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str());
- ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str());
- ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str());
- ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str());
- ASSERT_STREQ(
- "FIXED_LEN_BYTE_ARRAY", type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str());
+ ASSERT_STREQ("BOOLEAN", TypeToString(Type::BOOLEAN).c_str());
+ ASSERT_STREQ("INT32", TypeToString(Type::INT32).c_str());
+ ASSERT_STREQ("INT64", TypeToString(Type::INT64).c_str());
+ ASSERT_STREQ("INT96", TypeToString(Type::INT96).c_str());
+ ASSERT_STREQ("FLOAT", TypeToString(Type::FLOAT).c_str());
+ ASSERT_STREQ("DOUBLE", TypeToString(Type::DOUBLE).c_str());
+ ASSERT_STREQ("BYTE_ARRAY", TypeToString(Type::BYTE_ARRAY).c_str());
+ ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY", TypeToString(Type::FIXED_LEN_BYTE_ARRAY).c_str());
}
TEST(TestLogicalTypeToString, LogicalTypes) {
- ASSERT_STREQ("NONE", logical_type_to_string(LogicalType::NONE).c_str());
- ASSERT_STREQ("UTF8", logical_type_to_string(LogicalType::UTF8).c_str());
- ASSERT_STREQ(
- "MAP_KEY_VALUE", logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str());
- ASSERT_STREQ("LIST", logical_type_to_string(LogicalType::LIST).c_str());
- ASSERT_STREQ("ENUM", logical_type_to_string(LogicalType::ENUM).c_str());
- ASSERT_STREQ("DECIMAL", logical_type_to_string(LogicalType::DECIMAL).c_str());
- ASSERT_STREQ("DATE", logical_type_to_string(LogicalType::DATE).c_str());
- ASSERT_STREQ("TIME_MILLIS", logical_type_to_string(LogicalType::TIME_MILLIS).c_str());
- ASSERT_STREQ("TIME_MICROS", logical_type_to_string(LogicalType::TIME_MICROS).c_str());
+ ASSERT_STREQ("NONE", LogicalTypeToString(LogicalType::NONE).c_str());
+ ASSERT_STREQ("UTF8", LogicalTypeToString(LogicalType::UTF8).c_str());
+ ASSERT_STREQ("MAP_KEY_VALUE", LogicalTypeToString(LogicalType::MAP_KEY_VALUE).c_str());
+ ASSERT_STREQ("LIST", LogicalTypeToString(LogicalType::LIST).c_str());
+ ASSERT_STREQ("ENUM", LogicalTypeToString(LogicalType::ENUM).c_str());
+ ASSERT_STREQ("DECIMAL", LogicalTypeToString(LogicalType::DECIMAL).c_str());
+ ASSERT_STREQ("DATE", LogicalTypeToString(LogicalType::DATE).c_str());
+ ASSERT_STREQ("TIME_MILLIS", LogicalTypeToString(LogicalType::TIME_MILLIS).c_str());
+ ASSERT_STREQ("TIME_MICROS", LogicalTypeToString(LogicalType::TIME_MICROS).c_str());
ASSERT_STREQ(
- "TIMESTAMP_MILLIS", logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str());
+ "TIMESTAMP_MILLIS", LogicalTypeToString(LogicalType::TIMESTAMP_MILLIS).c_str());
ASSERT_STREQ(
- "TIMESTAMP_MICROS", logical_type_to_string(LogicalType::TIMESTAMP_MICROS).c_str());
- ASSERT_STREQ("UINT_8", logical_type_to_string(LogicalType::UINT_8).c_str());
- ASSERT_STREQ("UINT_16", logical_type_to_string(LogicalType::UINT_16).c_str());
- ASSERT_STREQ("UINT_32", logical_type_to_string(LogicalType::UINT_32).c_str());
- ASSERT_STREQ("UINT_64", logical_type_to_string(LogicalType::UINT_64).c_str());
- ASSERT_STREQ("INT_8", logical_type_to_string(LogicalType::INT_8).c_str());
- ASSERT_STREQ("INT_16", logical_type_to_string(LogicalType::INT_16).c_str());
- ASSERT_STREQ("INT_32", logical_type_to_string(LogicalType::INT_32).c_str());
- ASSERT_STREQ("INT_64", logical_type_to_string(LogicalType::INT_64).c_str());
- ASSERT_STREQ("JSON", logical_type_to_string(LogicalType::JSON).c_str());
- ASSERT_STREQ("BSON", logical_type_to_string(LogicalType::BSON).c_str());
- ASSERT_STREQ("INTERVAL", logical_type_to_string(LogicalType::INTERVAL).c_str());
+ "TIMESTAMP_MICROS", LogicalTypeToString(LogicalType::TIMESTAMP_MICROS).c_str());
+ ASSERT_STREQ("UINT_8", LogicalTypeToString(LogicalType::UINT_8).c_str());
+ ASSERT_STREQ("UINT_16", LogicalTypeToString(LogicalType::UINT_16).c_str());
+ ASSERT_STREQ("UINT_32", LogicalTypeToString(LogicalType::UINT_32).c_str());
+ ASSERT_STREQ("UINT_64", LogicalTypeToString(LogicalType::UINT_64).c_str());
+ ASSERT_STREQ("INT_8", LogicalTypeToString(LogicalType::INT_8).c_str());
+ ASSERT_STREQ("INT_16", LogicalTypeToString(LogicalType::INT_16).c_str());
+ ASSERT_STREQ("INT_32", LogicalTypeToString(LogicalType::INT_32).c_str());
+ ASSERT_STREQ("INT_64", LogicalTypeToString(LogicalType::INT_64).c_str());
+ ASSERT_STREQ("JSON", LogicalTypeToString(LogicalType::JSON).c_str());
+ ASSERT_STREQ("BSON", LogicalTypeToString(LogicalType::BSON).c_str());
+ ASSERT_STREQ("INTERVAL", LogicalTypeToString(LogicalType::INTERVAL).c_str());
}
TEST(TypePrinter, StatisticsTypes) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 7fc5017..97c769b 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -62,7 +62,7 @@ std::string FormatStatValue(Type::type parquet_type, const char* val) {
return result.str();
}
-std::string encoding_to_string(Encoding::type t) {
+std::string EncodingToString(Encoding::type t) {
switch (t) {
case Encoding::PLAIN:
return "PLAIN";
@@ -94,7 +94,7 @@ std::string encoding_to_string(Encoding::type t) {
}
}
-std::string compression_to_string(Compression::type t) {
+std::string CompressionToString(Compression::type t) {
switch (t) {
case Compression::UNCOMPRESSED:
return "UNCOMPRESSED";
@@ -114,7 +114,7 @@ std::string compression_to_string(Compression::type t) {
}
}
-std::string type_to_string(Type::type t) {
+std::string TypeToString(Type::type t) {
switch (t) {
case Type::BOOLEAN:
return "BOOLEAN";
@@ -146,7 +146,7 @@ std::string type_to_string(Type::type t) {
}
}
-std::string logical_type_to_string(LogicalType::type t) {
+std::string LogicalTypeToString(LogicalType::type t) {
switch (t) {
case LogicalType::NONE:
return "NONE";
@@ -196,4 +196,29 @@ std::string logical_type_to_string(LogicalType::type t) {
return "UNKNOWN";
}
}
+
+// Maps a physical Parquet type to the value_byte_size recorded in the
+// corresponding type_traits specialization. Any type that is not listed in
+// the switch below yields 0.
+int GetTypeByteSize(Type::type parquet_type) {
+  int byte_size = 0;
+  switch (parquet_type) {
+    case Type::BOOLEAN:
+      byte_size = type_traits<BooleanType::type_num>::value_byte_size;
+      break;
+    case Type::INT32:
+      byte_size = type_traits<Int32Type::type_num>::value_byte_size;
+      break;
+    case Type::INT64:
+      byte_size = type_traits<Int64Type::type_num>::value_byte_size;
+      break;
+    case Type::INT96:
+      byte_size = type_traits<Int96Type::type_num>::value_byte_size;
+      break;
+    case Type::DOUBLE:
+      byte_size = type_traits<DoubleType::type_num>::value_byte_size;
+      break;
+    case Type::FLOAT:
+      byte_size = type_traits<FloatType::type_num>::value_byte_size;
+      break;
+    case Type::BYTE_ARRAY:
+      byte_size = type_traits<ByteArrayType::type_num>::value_byte_size;
+      break;
+    case Type::FIXED_LEN_BYTE_ARRAY:
+      byte_size = type_traits<FLBAType::type_num>::value_byte_size;
+      break;
+    default:
+      // Unknown type: keep the 0 initial value.
+      break;
+  }
+  return byte_size;
+}
+
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index cb67820..a4285be 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -269,15 +269,17 @@ inline std::string format_fwf(int width) {
return ss.str();
}
-std::string compression_to_string(Compression::type t);
+std::string CompressionToString(Compression::type t);
-std::string encoding_to_string(Encoding::type t);
+std::string EncodingToString(Encoding::type t);
-std::string logical_type_to_string(LogicalType::type t);
+std::string LogicalTypeToString(LogicalType::type t);
-std::string type_to_string(Type::type t);
+std::string TypeToString(Type::type t);
std::string FormatStatValue(Type::type parquet_type, const char* val);
+
+int GetTypeByteSize(Type::type t);
} // namespace parquet
#endif // PARQUET_TYPES_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 0000000..5c4eaa8
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Libraries linked into every tool in addition to parquet_static.
+set(LINK_LIBS
+  snappystatic
+  thriftstatic)
+
+# The command-line tools are only built when PARQUET_BUILD_EXECUTABLES is set.
+# Each tool is compiled from the single source file named <tool>.cc.
+if (PARQUET_BUILD_EXECUTABLES)
+  foreach(tool_name parquet-dump-schema parquet_reader parquet-scan)
+    add_executable(${tool_name} ${tool_name}.cc)
+    target_link_libraries(${tool_name} ${LINK_LIBS}
+      parquet_static)
+  endforeach()
+endif()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet-dump-schema.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-dump-schema.cc b/tools/parquet-dump-schema.cc
new file mode 100644
index 0000000..deef2fd
--- /dev/null
+++ b/tools/parquet-dump-schema.cc
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+
+#include "parquet/api/reader.h"
+#include "parquet/api/schema.h"
+
+// Prints the schema of the Parquet file named by the first command-line
+// argument to stdout.
+//
+// Returns 0 on success, and -1 when no file argument was given or when
+// opening/printing the file raises an exception.
+int main(int argc, char** argv) {
+  // argv[1] is a null pointer when no argument was supplied; building a
+  // std::string from it would be undefined behaviour, so bail out first.
+  if (argc < 2) {
+    std::cerr << "Usage: parquet-dump-schema <file>" << std::endl;
+    return -1;
+  }
+
+  const std::string filename = argv[1];
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet-scan.cc
----------------------------------------------------------------------
diff --git a/tools/parquet-scan.cc b/tools/parquet-scan.cc
new file mode 100644
index 0000000..d146a1d
--- /dev/null
+++ b/tools/parquet-scan.cc
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ctime>
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+// Scans the selected columns of every row group in a Parquet file and reports
+// the number of rows read together with the processor time (std::clock) spent.
+//
+// Recognized arguments:
+//   --batch-size=N       number of values requested per ScanAllValues call
+//                        (defaults to 256, must be positive)
+//   --columns=i,j,...    comma-separated column indices; all columns of the
+//                        file are scanned when none are given
+//   anything else        taken as the file name
+//
+// Returns 0 on success and -1 on a usage error or when an exception is thrown
+// while reading the file.
+int main(int argc, char** argv) {
+  // argv[0] alone is not enough: at least the file argument is required.
+  if (argc > 4 || argc < 2) {
+    std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  std::string filename;
+
+  // Read command-line options
+  int batch_size = 256;
+  const std::string COLUMNS_PREFIX = "--columns=";
+  const std::string BATCH_SIZE_PREFIX = "--batch-size=";
+  std::vector<int> columns;
+
+  char *param, *value;
+  for (int i = 1; i < argc; i++) {
+    if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+      value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+      }
+    } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
+      value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
+      if (value) { batch_size = std::atoi(value); }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  // Only options were passed; there is nothing to open.
+  if (filename.empty()) {
+    std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  // A zero or negative batch size cannot be used to size the buffers below.
+  if (batch_size <= 0) {
+    std::cerr << "Parquet error: --batch-size must be a positive integer" << std::endl;
+    return -1;
+  }
+
+  std::vector<int16_t> rep_levels(batch_size);
+  std::vector<int16_t> def_levels(batch_size);
+  try {
+    std::clock_t start_time = std::clock();
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    // columns are not specified explicitly. Add all columns
+    if (columns.empty()) {
+      const int num_file_columns = reader->metadata()->num_columns();
+      columns.resize(num_file_columns);
+      for (int i = 0; i < num_file_columns; i++) {
+        columns[i] = i;
+      }
+    }
+
+    // One running total per scanned column. The totals are zeroed once here
+    // so that they accumulate across all row groups instead of being reset
+    // for every row group.
+    std::vector<int64_t> total_rows(columns.size(), 0);
+
+    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+      auto group_reader = reader->RowGroup(r);
+      size_t col = 0;
+      for (auto i : columns) {
+        std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
+        size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+        // Scratch buffer large enough for one batch of this column's values.
+        std::vector<uint8_t> values(batch_size * value_byte_size);
+
+        int64_t values_read = 0;
+        while (col_reader->HasNext()) {
+          total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
+              rep_levels.data(), values.data(), &values_read, col_reader.get());
+        }
+        col++;
+      }
+    }
+
+    const double total_time =
+        (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
+    for (size_t ct = 1; ct < total_rows.size(); ++ct) {
+      if (total_rows[0] != total_rows[ct]) {
+        std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
+      }
+    }
+    // A file without columns has no per-column total to report.
+    const int64_t rows_scanned = total_rows.empty() ? 0 : total_rows[0];
+    std::cout << rows_scanned << " rows scanned in " << total_time << " seconds."
+              << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/c67ee3e5/tools/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc
new file mode 100644
index 0000000..ced84d5
--- /dev/null
+++ b/tools/parquet_reader.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+// Opens a Parquet file and dumps it to stdout through
+// ParquetFileReader::DebugPrint.
+//
+// Recognized arguments:
+//   --only-metadata     passes print_values = false to DebugPrint
+//   --no-memory-map     passes memory_map = false to OpenFile
+//   --columns=a,b,...   comma-separated column indices forwarded to DebugPrint
+//   anything else       taken as the file name (the last such argument wins)
+//
+// Returns 0 on success and -1 on a bad argument count or when an exception is
+// thrown while reading the file.
+int main(int argc, char** argv) {
+  const bool bad_arg_count = (argc < 2) || (argc > 5);
+  if (bad_arg_count) {
+    std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+                 "[--columns=...] <file>"
+              << std::endl;
+    return -1;
+  }
+
+  const std::string columns_flag = "--columns=";
+
+  std::string path;
+  std::list<int> selected_columns;
+  bool show_values = true;
+  bool use_mmap = true;
+
+  // Classify each argument; flags are detected by substring search.
+  for (int idx = 1; idx < argc; ++idx) {
+    char* arg = argv[idx];
+
+    if (std::strstr(arg, "--only-metadata") != nullptr) {
+      show_values = false;
+      continue;
+    }
+    if (std::strstr(arg, "--no-memory-map") != nullptr) {
+      use_mmap = false;
+      continue;
+    }
+
+    char* flag_start = std::strstr(arg, columns_flag.c_str());
+    if (flag_start == nullptr) {
+      // Not a known flag, so treat it as the file to read.
+      path = arg;
+      continue;
+    }
+
+    // Split the text after "--columns=" on commas, one index per token.
+    for (char* token = std::strtok(flag_start + columns_flag.length(), ",");
+         token != nullptr; token = std::strtok(nullptr, ",")) {
+      selected_columns.push_back(std::atoi(token));
+    }
+  }
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(path, use_mmap);
+    reader->DebugPrint(std::cout, selected_columns, show_values);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}