You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/27 12:19:39 UTC
[arrow] branch master updated (d54f13d -> 723a437)
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git.
from d54f13d ARROW-3327: [Python] Use local Arrow checkout instead of separate clone
new 6beeaf4 PARQUET-681: Add tool to scan a parquet file
new d1e8433 PARQUET-711: Use metadata builders in parquet writer
new 5d0f5ac PARQUET-728: Incorporate upstream Arrow API changes
new de3f844 PARQUET-702: Add a writer + reader example with detailed comments
new 713c6cd PARQUET-818: Refactoring to utilize common IO, buffer, memory management abstractions and implementations
new 58ad1a8 PARQUET-807: Allow user to retain ownership of parquet::FileMetaData.
new 22a96d9 PARQUET-892: Specify public link targets for parquet_static so that transitive dependencies are linked in executables
new 4b444c6 PARQUET-909: Reduce buffer allocations (mallocs) on critical path
new bfb1e9a PARQUET-508: Add ParquetFilePrinter
new 174cc09 PARQUET-958: [C++] Print Parquet metadata in JSON format
new 3e15021 PARQUET-595: API for KeyValue metadata
new ebf0507 PARQUET-679: Local Windows build and Appveyor support
new 67f84a9 PARQUET-991: Resolve msvc warnings; Appveyor treats msvc warnings as …
new 4d21fd3 PARQUET-1029: [C++] Some extern template symbols not being exported in gcc
new 80dc883 PARQUET-1048: Apache Arrow static transitive dependencies
new 09cd545 PARQUET-1053: Fix unused result warnings due to unchecked Statuses
new d02cd9d PARQUET-1068: Modify .clang-format to use straight Google format with 90-character line width
new 144699c PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
new eefd36e PARQUET-1177: Add PARQUET_BUILD_WARNING_LEVEL option and more rigorous Clang warnings
new 3a9dbdf PARQUET-1196: Example parquet_arrow project
new fca0625 PARQUET-1270: Install executable tools
new 37207bd PARQUET-1256: Add --print-key-value-metadata option to parquet_reader tool
new 0749154 PARQUET-1372: Add an API to allow writing RowGroups based on size
new 723a437 PARQUET-1427: [C++] Incorporate with build system, parquet target. Fix parquet-arrow example per repo changes
The 24 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
ci/travis_before_script_cpp.sh | 4 +-
cpp/CHANGELOG_PARQUET.md | 501 +++++++++++++++++++++
cpp/CMakeLists.txt | 10 +
.../parquet/low-level-api}/CMakeLists.txt | 23 +-
.../parquet/low-level-api/reader-writer.cc | 409 +++++++++++++++++
.../parquet/low-level-api/reader-writer2.cc | 430 ++++++++++++++++++
cpp/examples/parquet/low-level-api/reader_writer.h | 71 +++
cpp/examples/parquet/parquet-arrow/CMakeLists.txt | 46 ++
cpp/examples/parquet/parquet-arrow/README.md | 20 +
.../parquet-arrow/cmake_modules/FindArrow.cmake | 1 +
.../parquet-arrow}/cmake_modules/FindParquet.cmake | 2 +-
.../parquet/parquet-arrow/src/reader-writer.cc | 134 ++++++
.../python/util => tools/parquet}/CMakeLists.txt | 32 +-
.../parquet/parquet-dump-schema.cc} | 18 +-
cpp/tools/parquet/parquet-scan.cc | 78 ++++
cpp/tools/parquet/parquet_reader.cc | 79 ++++
dev/release/rat_exclude_files.txt | 2 +
17 files changed, 1823 insertions(+), 37 deletions(-)
create mode 100644 cpp/CHANGELOG_PARQUET.md
copy cpp/{src/parquet/arrow => examples/parquet/low-level-api}/CMakeLists.txt (62%)
create mode 100644 cpp/examples/parquet/low-level-api/reader-writer.cc
create mode 100644 cpp/examples/parquet/low-level-api/reader-writer2.cc
create mode 100644 cpp/examples/parquet/low-level-api/reader_writer.h
create mode 100644 cpp/examples/parquet/parquet-arrow/CMakeLists.txt
create mode 100644 cpp/examples/parquet/parquet-arrow/README.md
create mode 120000 cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
copy cpp/{ => examples/parquet/parquet-arrow}/cmake_modules/FindParquet.cmake (98%)
create mode 100644 cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
copy cpp/{src/arrow/python/util => tools/parquet}/CMakeLists.txt (61%)
copy cpp/{src/arrow/util/benchmark_main.cc => tools/parquet/parquet-dump-schema.cc} (65%)
create mode 100644 cpp/tools/parquet/parquet-scan.cc
create mode 100644 cpp/tools/parquet/parquet_reader.cc
[arrow] 16/24: PARQUET-1053: Fix unused result warnings due to
unchecked Statuses
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 09cd54540c0b86323de4ccc2a5e918009845677b
Author: Phillip Cloud <cp...@gmail.com>
AuthorDate: Mon Jul 10 22:03:25 2017 -0400
PARQUET-1053: Fix unused result warnings due to unchecked Statuses
Author: Phillip Cloud <cp...@gmail.com>
Closes #369 from cpcloud/PARQUET-1053 and squashes the following commits:
e0598b4 [Phillip Cloud] PARQUET-1053: Fix unused result warnings due to unchecked Statuses
Change-Id: I91d267f56685c0e1267b6069c5bff52f2a1eca15
---
cpp/examples/parquet/reader-writer.cc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 6f21f6c..210968c 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -22,6 +22,7 @@
#include <memory>
#include <arrow/io/file.h>
+#include <arrow/util/logging.h>
#include <parquet/api/reader.h>
#include <parquet/api/writer.h>
@@ -216,7 +217,7 @@ int main(int argc, char** argv) {
file_writer->Close();
// Write the bytes to file
- out_file->Close();
+ DCHECK(out_file->Close().ok());
} catch (const std::exception& e) {
std::cerr << "Parquet write error: " << e.what() << std::endl;
return -1;
[arrow] 05/24: PARQUET-818: Refactoring to utilize common IO, buffer,
memory management abstractions and implementations
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 713c6cda9f55b5e60bc5efbb77f4dc46badf79eb
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Fri Dec 30 11:36:05 2016 -0500
PARQUET-818: Refactoring to utilize common IO, buffer, memory management abstractions and implementations
This refactoring is a bit of a bloodbath, but I've attempted to preserve as much API backwards compatibility as possible.
Several points
* Arrow does not use exceptions, so will need to be very careful about making sure that no Status goes unchecked. I've tried to get most of them, but might have missed some
* parquet-cpp still exposes an abstract file read and write API as before, but this makes it easy to pass in an Arrow file handle (e.g. HDFS, OS files, memory maps, etc.)
* Custom memory allocators will need to subclass `arrow::MemoryPool` instead. If this becomes onerous for some reason, we can try to find alternatives, but basically it's the exact same class as `parquet::MemoryAllocator`
Does not require any upstream changes in Arrow.
Author: Wes McKinney <we...@twosigma.com>
Closes #210 from wesm/arrow-consolidation and squashes the following commits:
ef81084 [Wes McKinney] Configurable Arrow linkage. Slight .travis.yml cleaning
50b44f0 [Wes McKinney] Make some const refs
8438f86 [Wes McKinney] Revert ParquetFileReader::Open to use std::unique_ptr<RandomAccessFile>
671d981 [Wes McKinney] Actually tee output to console
ca8df13 [Wes McKinney] Do not hide test output from travis logs
f516115 [Wes McKinney] Add public link libs to dependencies to avoid race conditions with external projects
414c75f [Wes McKinney] README cleanups
be1acb5 [Wes McKinney] Move thirdparty ep's / setup to separate cmake module
46342ea [Wes McKinney] Remove unneeded ParquetAllocator interface, cleaning
b546f08 [Wes McKinney] Use MemoryAllocator alias within parquet core
8c1226d [Wes McKinney] Add Arrow to list of third party deps. Needs to be added to thirdparty
f9d8a2a [Wes McKinney] Check some unchecked Statuses
0d04820 [Wes McKinney] Fix benchmark builds. Do not fail in benchmarks if gtest.h is included due to <tr1/tuple> issue
ee312af [Wes McKinney] cpplint
6a05cd9 [Wes McKinney] Update installed header files
8d962f1 [Wes McKinney] Build and unit tests pass again
c82e2b4 [Wes McKinney] More refactoring
6ec5b71 [Wes McKinney] Re-expose original abstract IO interfaces, add Arrow subclasses that wrap input
c320c95 [Wes McKinney] clang-format
f10080c [Wes McKinney] Fix missed include
6ade22f [Wes McKinney] First cut refactoring, not fully compiling yet
Change-Id: Ibc4238396c035e185346c2562a0c6dc029c254ff
---
cpp/examples/parquet/reader-writer.cc | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index cc066ac..0289eed 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -21,6 +21,8 @@
#include <list>
#include <memory>
+#include <arrow/io/file.h>
+
#include <parquet/api/reader.h>
#include <parquet/api/writer.h>
@@ -101,8 +103,9 @@ int main(int argc, char** argv) {
// parquet::REPEATED fields require both definition and repetition level values
try {
// Create a local file output stream instance.
- std::shared_ptr<parquet::OutputStream> out_file =
- std::make_shared<parquet::LocalFileOutputStream>(PARQUET_FILENAME);
+ using FileClass = ::arrow::io::FileOutputStream;
+ std::shared_ptr<FileClass> out_file;
+ PARQUET_THROW_NOT_OK(FileClass::Open(PARQUET_FILENAME, &out_file));
// Setup the parquet schema
std::shared_ptr<GroupNode> schema = SetupSchema();
[arrow] 10/24: PARQUET-958: [C++] Print Parquet metadata in JSON
format
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 174cc09ab31ade4c7d51869e662374ad71487d29
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Tue Apr 25 08:34:58 2017 +0200
PARQUET-958: [C++] Print Parquet metadata in JSON format
Made minor formatting changes to DebugPrint
No support to print values. Only the metadata is JSON formatted in this patch.
Author: Deepak Majeti <de...@hpe.com>
Closes #310 from majetideepak/PARQUET-958 and squashes the following commits:
4d9cbbd [Deepak Majeti] change DebugPrint to take filename
3c78bc0 [Deepak Majeti] use raw string
97f016a [Deepak Majeti] add test and clang format
ec12ddb [Deepak Majeti] add JSONPrint
9c697e2 [Deepak Majeti] fix CMake flag for benchmarks
Change-Id: Iaf4f4ba609ea5d26a061c9905e5770580ea6b0ed
---
cpp/tools/parquet/parquet_reader.cc | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
index 25f81c1..7ef59dc 100644
--- a/cpp/tools/parquet/parquet_reader.cc
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -23,7 +23,7 @@
int main(int argc, char** argv) {
if (argc > 5 || argc < 2) {
- std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+ std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]"
"[--columns=...] <file>"
<< std::endl;
return -1;
@@ -32,6 +32,7 @@ int main(int argc, char** argv) {
std::string filename;
bool print_values = true;
bool memory_map = true;
+ bool format_json = false;
// Read command-line options
const std::string COLUMNS_PREFIX = "--columns=";
@@ -43,6 +44,8 @@ int main(int argc, char** argv) {
print_values = false;
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
memory_map = false;
+ } else if ((param = std::strstr(argv[i], "--json"))) {
+ format_json = true;
} else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
while (value) {
@@ -58,7 +61,11 @@ int main(int argc, char** argv) {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename, memory_map);
parquet::ParquetFilePrinter printer(reader.get());
- printer.DebugPrint(std::cout, columns, print_values);
+ if (format_json) {
+ printer.JSONPrint(std::cout, columns, filename.c_str());
+ } else {
+ printer.DebugPrint(std::cout, columns, print_values, filename.c_str());
+ }
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;
return -1;
[arrow] 13/24: PARQUET-991: Resolve msvc warnings; Appveyor treats msvc warnings as …
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 67f84a997df5adfab3a12868027e459587aa6a44
Author: Max Risuhin <ri...@gmail.com>
AuthorDate: Mon May 29 15:55:59 2017 -0400
PARQUET-991: Resolve msvc warnings; Appveyor treats msvc warnings as …
…errors (/WX flag)
Author: Max Risuhin <ri...@gmail.com>
Closes #340 from MaxRis/PARQUET-991 and squashes the following commits:
98a2544 [Max Risuhin] PARQUET-991: Resolve msvc warnings; Appveyor treats msvc warnings as errors (/WX flag)
Change-Id: I8fe6ca37debbd3b300e7aa51c5d015da11ecf79d
---
cpp/examples/parquet/reader-writer.cc | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 9118c88..6f21f6c 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -30,7 +30,7 @@
* This example describes writing and reading Parquet Files in C++ and serves as a
* reference to the API.
* The file contains all the physical data types supported by Parquet.
-**/
+ **/
/* Parquet is a structured columnar file format
* Parquet File = "Parquet data" + "Parquet Metadata"
@@ -42,7 +42,7 @@
* complex (nested) type (internal nodes)
* For specific details, please refer the format here:
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
-**/
+ **/
constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
constexpr int FIXED_LENGTH = 10;
@@ -168,7 +168,7 @@ int main(int argc, char** argv) {
parquet::FloatWriter* float_writer =
static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- float value = i * 1.1;
+ float value = i * 1.1f;
float_writer->WriteBatch(1, nullptr, nullptr, &value);
}
@@ -367,7 +367,7 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- float expected_value = i * 1.1;
+ float expected_value = i * 1.1f;
assert(value == expected_value);
i++;
}
[arrow] 12/24: PARQUET-679: Local Windows build and Appveyor support
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit ebf0507cdc9fa792f582bf56c04bf2bbc3782584
Author: Max Risuhin <ri...@gmail.com>
AuthorDate: Tue May 2 09:29:15 2017 -0400
PARQUET-679: Local Windows build and Appveyor support
Author: Max Risuhin <ri...@gmail.com>
Closes #313 from MaxRis/PARQUET-679 and squashes the following commits:
c5c4a8c [Max Risuhin] PARQUET-679: Local Windows build and Appveyor support
Change-Id: I6ca8937537e035bce8b9c83fb811ea6d58c6d3f0
---
cpp/tools/parquet/parquet-scan.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index f0bbb8e..8ab15a4 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -71,7 +71,7 @@ int main(int argc, char** argv) {
}
}
- int64_t total_rows[num_columns];
+ std::vector<int64_t> total_rows(num_columns);
for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
auto group_reader = reader->RowGroup(r);
[arrow] 24/24: PARQUET-1427: [C++] Incorporate with build system,
parquet target. Fix parquet-arrow example per repo changes
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 723a437802143fd00d97c101caaf6deabea3f8c6
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed Sep 26 10:44:42 2018 -0400
PARQUET-1427: [C++] Incorporate with build system, parquet target. Fix
parquet-arrow example per repo changes
Change-Id: I17239a300b376b0b7bcf7dead0fc1ac758af2206
---
ci/travis_before_script_cpp.sh | 4 +-
cpp/CHANGELOG_PARQUET.md | 501 +++++++++++++++++++++
cpp/CMakeLists.txt | 10 +
cpp/examples/parquet/low-level-api/CMakeLists.txt | 18 +-
cpp/examples/parquet/parquet-arrow/CMakeLists.txt | 32 --
.../cmake_modules/ArrowExternalProject.cmake | 1 -
.../parquet-arrow/cmake_modules/FindArrow.cmake | 2 +-
.../parquet/parquet-arrow/src/reader-writer.cc | 2 +-
cpp/tools/parquet/CMakeLists.txt | 6 +-
dev/release/rat_exclude_files.txt | 2 +
10 files changed, 533 insertions(+), 45 deletions(-)
diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh
index 54a00f7..1729ca4 100755
--- a/ci/travis_before_script_cpp.sh
+++ b/ci/travis_before_script_cpp.sh
@@ -83,7 +83,9 @@ if [ $ARROW_TRAVIS_ORC == "1" ]; then
fi
if [ $ARROW_TRAVIS_PARQUET == "1" ]; then
- CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_PARQUET=ON"
+ CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS \
+-DARROW_PARQUET=ON \
+-DPARQUET_BUILD_EXECUTABLES=ON"
fi
if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then
diff --git a/cpp/CHANGELOG_PARQUET.md b/cpp/CHANGELOG_PARQUET.md
new file mode 100644
index 0000000..06a09c2
--- /dev/null
+++ b/cpp/CHANGELOG_PARQUET.md
@@ -0,0 +1,501 @@
+Parquet C++ 1.5.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-979] - [C++] Limit size of min, max or disable stats for long binary types
+ * [PARQUET-1071] - [C++] parquet::arrow::FileWriter::Close is not idempotent
+ * [PARQUET-1349] - [C++] PARQUET_RPATH_ORIGIN is not picked by the build
+ * [PARQUET-1334] - [C++] memory_map parameter seems misleading in parquet file opener
+ * [PARQUET-1333] - [C++] Reading of files with dictionary size 0 fails on Windows with bad_alloc
+ * [PARQUET-1283] - [C++] FormatStatValue appends trailing space to string and int96
+ * [PARQUET-1270] - [C++] Executable tools do not get installed
+ * [PARQUET-1272] - [C++] ScanFileContents reports wrong row count for nested columns
+ * [PARQUET-1268] - [C++] Conversion of Arrow null list columns fails
+ * [PARQUET-1255] - [C++] Exceptions thrown in some tests
+ * [PARQUET-1358] - [C++] index_page_offset should be unset as it is not supported.
+ * [PARQUET-1357] - [C++] FormatStatValue truncates binary statistics on zero character
+ * [PARQUET-1319] - [C++] Pass BISON_EXECUTABLE to Thrift EP for MacOS
+ * [PARQUET-1313] - [C++] Compilation failure with VS2017
+ * [PARQUET-1315] - [C++] ColumnChunkMetaData.has_dictionary_page() should return bool, not int64_t
+ * [PARQUET-1307] - [C++] memory-test fails with latest Arrow
+ * [PARQUET-1274] - [Python] SegFault in pyarrow.parquet.write_table with specific options
+ * [PARQUET-1209] - locally defined symbol ... imported in function ..
+ * [PARQUET-1245] - [C++] Segfault when writing Arrow table with duplicate columns
+ * [PARQUET-1273] - [Python] Error writing to partitioned Parquet dataset
+ * [PARQUET-1384] - [C++] Clang compiler warnings in bloom_filter-test.cc
+
+## Improvement
+ * [PARQUET-1348] - [C++] Allow Arrow FileWriter To Write FileMetaData
+ * [PARQUET-1346] - [C++] Protect against null values data in empty Arrow array
+ * [PARQUET-1340] - [C++] Fix Travis Ci valgrind errors related to std::random_device
+ * [PARQUET-1323] - [C++] Fix compiler warnings with clang-6.0
+ * [PARQUET-1279] - Use ASSERT_NO_FATAL_FAILURE in C++ unit tests
+ * [PARQUET-1262] - [C++] Use the same BOOST_ROOT and Boost_NAMESPACE for Thrift
+ * [PARQUET-1267] - replace "unsafe" std::equal by std::memcmp
+ * [PARQUET-1360] - [C++] Minor API + style changes follow up to PARQUET-1348
+ * [PARQUET-1166] - [API Proposal] Add GetRecordBatchReader in parquet/arrow/reader.h
+ * [PARQUET-1378] - [c++] Allow RowGroups with zero rows to be written
+ * [PARQUET-1256] - [C++] Add --print-key-value-metadata option to parquet_reader tool
+ * [PARQUET-1276] - [C++] Reduce the amount of memory used for writing null decimal values
+
+## New Feature
+ * [PARQUET-1392] - [C++] Supply row group indices to parquet::arrow::FileReader::ReadTable
+
+## Sub-task
+ * [PARQUET-1227] - Thrift crypto metadata structures
+ * [PARQUET-1332] - [C++] Add bloom filter utility class
+
+## Task
+ * [PARQUET-1350] - [C++] Use abstract ResizableBuffer instead of concrete PoolBuffer
+ * [PARQUET-1366] - [C++] Streamline use of Arrow bit-util.h
+ * [PARQUET-1308] - [C++] parquet::arrow should use thread pool, not ParallelFor
+ * [PARQUET-1382] - [C++] Prepare for arrow::test namespace removal
+ * [PARQUET-1372] - [C++] Add an API to allow writing RowGroups based on their size rather than num_rows
+
+
+Parquet C++ 1.4.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1193] - [CPP] Implement ColumnOrder to support min_value and max_value
+ * [PARQUET-1180] - C++: Fix behaviour of num_children element of primitive nodes
+ * [PARQUET-1146] - C++: Add macOS-compatible sha512sum call to release verify script
+ * [PARQUET-1167] - [C++] FieldToNode function should return a status when throwing an exception
+ * [PARQUET-1175] - [C++] Fix usage of deprecated Arrow API
+ * [PARQUET-1113] - [C++] Incorporate fix from ARROW-1601 on bitmap read path
+ * [PARQUET-1111] - dev/release/verify-release-candidate has stale help
+ * [PARQUET-1109] - C++: Update release verification script to SHA512
+ * [PARQUET-1179] - [C++] Support Apache Thrift 0.11
+ * [PARQUET-1226] - [C++] Fix new build warnings with clang 5.0
+ * [PARQUET-1233] - [CPP ]Enable option to switch between stl classes and boost classes for thrift header
+ * [PARQUET-1205] - Fix msvc static build
+ * [PARQUET-1210] - [C++] Boost 1.66 compilation fails on Windows on linkage stage
+
+## Improvement
+ * [PARQUET-1092] - [C++] Write Arrow tables with chunked columns
+ * [PARQUET-1086] - [C++] Remove usage of arrow/util/compiler-util.h after 1.3.0 release
+ * [PARQUET-1097] - [C++] Account for Arrow API deprecation in ARROW-1511
+ * [PARQUET-1150] - C++: Hide statically linked boost symbols
+ * [PARQUET-1151] - [C++] Add build options / configuration to use static runtime libraries with MSVC
+ * [PARQUET-1147] - [C++] Account for API deprecation / change in ARROW-1671
+ * [PARQUET-1162] - C++: Update dev/README after migration to Gitbox
+ * [PARQUET-1165] - [C++] Pin clang-format version to 4.0
+ * [PARQUET-1164] - [C++] Follow API changes in ARROW-1808
+ * [PARQUET-1177] - [C++] Add more extensive compiler warnings when using Clang
+ * [PARQUET-1110] - [C++] Release verification script for Windows
+ * [PARQUET-859] - [C++] Flatten parquet/file directory
+ * [PARQUET-1220] - [C++] Don't build Thrift examples and tutorials in the ExternalProject
+ * [PARQUET-1219] - [C++] Update release-candidate script links to gitbox
+ * [PARQUET-1196] - [C++] Provide a parquet_arrow example project incl. CMake setup
+ * [PARQUET-1200] - [C++] Support reading a single Arrow column from a Parquet file
+
+## New Feature
+ * [PARQUET-1095] - [C++] Read and write Arrow decimal values
+ * [PARQUET-970] - Add Lz4 and Zstd compression codecs
+
+## Task
+ * [PARQUET-1221] - [C++] Extend release README
+ * [PARQUET-1225] - NaN values may lead to incorrect filtering under certain circumstances
+
+
+Parquet C++ 1.3.1
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1105] - [CPP] Remove libboost_system dependency
+ * [PARQUET-1138] - [C++] Fix compilation with Arrow 0.7.1
+ * [PARQUET-1123] - [C++] Update parquet-cpp to use Arrow's AssertArraysEqual
+ * [PARQUET-1121] - C++: DictionaryArrays of NullType cannot be written
+ * [PARQUET-1139] - Add license to cmake_modules/parquet-cppConfig.cmake.in
+
+## Improvement
+ * [PARQUET-1140] - [C++] Fail on RAT errors in CI
+ * [PARQUET-1070] - Add CPack support to the build
+
+
+Parquet C++ 1.3.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1098] - [C++] Install new header in parquet/util
+ * [PARQUET-1085] - [C++] Backwards compatibility from macro cleanup in transitive dependencies in ARROW-1452
+ * [PARQUET-1074] - [C++] Switch to long key ids in KEYs file
+ * [PARQUET-1075] - C++: Coverage upload is broken
+ * [PARQUET-1088] - [CPP] remove parquet_version.h from version control since it gets auto generated
+ * [PARQUET-1002] - [C++] Compute statistics based on Logical Types
+ * [PARQUET-1100] - [C++] Reading repeated types should decode number of records rather than number of values
+ * [PARQUET-1090] - [C++] Fix int32 overflow in Arrow table writer, add max row group size property
+ * [PARQUET-1108] - [C++] Fix Int96 comparators
+
+## Improvement
+ * [PARQUET-1104] - [C++] Upgrade to Apache Arrow 0.7.0 RC0
+ * [PARQUET-1072] - [C++] Add ARROW_NO_DEPRECATED_API to CI to check for deprecated API use
+ * [PARQUET-1096] - C++: Update sha{1, 256, 512} checksums per latest ASF release policy
+ * [PARQUET-1079] - [C++] Account for Arrow API change in ARROW-1335
+ * [PARQUET-1087] - [C++] Add wrapper for ScanFileContents in parquet::arrow that catches exceptions
+ * [PARQUET-1093] - C++: Improve Arrow level generation error message
+ * [PARQUET-1094] - C++: Add benchmark for boolean Arrow column I/O
+ * [PARQUET-1083] - [C++] Refactor core logic in parquet-scan.cc so that it can be used as a library function for benchmarking
+ * [PARQUET-1037] - Allow final RowGroup to be unfilled
+
+## New Feature
+ * [PARQUET-1078] - [C++] Add Arrow writer option to coerce timestamps to milliseconds or microseconds
+ * [PARQUET-929] - [C++] Handle arrow::DictionaryArray when writing Arrow data
+
+
+Parquet C++ 1.2.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-1029] - [C++] TypedColumnReader/TypeColumnWriter symbols are no longer being exported
+ * [PARQUET-997] - Fix override compiler warnings
+ * [PARQUET-1033] - Mismatched Read and Write
+ * [PARQUET-1007] - [C++ ] Update parquet.thrift from https://github.com/apache/parquet-format
+ * [PARQUET-1039] - PARQUET-911 Breaks Arrow
+ * [PARQUET-1038] - Key value metadata should be nullptr if not set
+ * [PARQUET-1018] - [C++] parquet.dll has runtime dependencies on one or more libraries in the build toolchain
+ * [PARQUET-1003] - [C++] Modify DEFAULT_CREATED_BY value for every new release version
+ * [PARQUET-1004] - CPP Building fails on windows
+ * [PARQUET-1040] - Missing writer method implementations
+ * [PARQUET-1054] - [C++] Account for Arrow API changes in ARROW-1199
+ * [PARQUET-1042] - C++: Compilation breaks on GCC 4.8
+ * [PARQUET-1048] - [C++] Static linking of libarrow is no longer supported
+ * [PARQUET-1013] - Fix ZLIB_INCLUDE_DIR
+ * [PARQUET-998] - C++: Release script is not usable
+ * [PARQUET-1023] - [C++] Brotli libraries are not being statically linked on Windows
+ * [PARQUET-1000] - [C++] Do not build thirdparty Arrow with /WX on MSVC
+ * [PARQUET-1052] - [C++] add_compiler_export_flags() throws warning with CMake >= 3.3
+ * [PARQUET-1069] - C++: ./dev/release/verify-release-candidate is broken due to missing Arrow dependencies
+
+## Improvement
+ * [PARQUET-996] - Improve MSVC build - ThirdpartyToolchain - Arrow
+ * [PARQUET-911] - C++: Support nested structs in parquet_arrow
+ * [PARQUET-986] - Improve MSVC build - ThirdpartyToolchain - Thrift
+ * [PARQUET-864] - [C++] Consolidate non-Parquet-specific bit utility code into Apache Arrow
+ * [PARQUET-1043] - [C++] Raise minimum supported CMake version to 3.2
+ * [PARQUET-1016] - Upgrade thirdparty Arrow to 0.4.0
+ * [PARQUET-858] - [C++] Flatten parquet/column directory, consolidate related code
+ * [PARQUET-978] - [C++] Minimizing footer reads for small(ish) metadata
+ * [PARQUET-991] - [C++] Fix compiler warnings on MSVC and build with /WX in Appveyor
+ * [PARQUET-863] - [C++] Move SIMD, CPU info, hashing, and other generic utilities into Apache Arrow
+ * [PARQUET-1053] - Fix unused result warnings due to unchecked Statuses
+ * [PARQUET-1067] - C++: Update arrow hash to 0.5.0
+ * [PARQUET-1041] - C++: Support Arrow's NullArray
+ * [PARQUET-1008] - Update TypedColumnReader::ReadBatch method to accept batch_size as int64_t
+ * [PARQUET-1044] - [C++] Use compression libraries from Apache Arrow
+ * [PARQUET-999] - Improve MSVC build - Enable PARQUET_BUILD_BENCHMARKS
+ * [PARQUET-967] - [C++] Combine libparquet/libparquet_arrow libraries
+ * [PARQUET-1045] - [C++] Refactor to account for computational utility code migration in ARROW-1154
+
+## New Feature
+ * [PARQUET-1035] - Write Int96 from Arrow Timestamp(ns)
+
+## Task
+ * [PARQUET-994] - C++: release-candidate script should not push to master
+ * [PARQUET-902] - [C++] Move compressor interfaces into Apache Arrow
+
+## Test
+ * [PARQUET-706] - [C++] Create test case that uses libparquet as a 3rd party library
+
+
+Parquet C++ 1.1.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-898] - [C++] Change Travis CI OS X image to Xcode 6.4 and fix our thirdparty build
+ * [PARQUET-976] - [C++] Pass unit test suite with MSVC, build in Appveyor
+ * [PARQUET-963] - [C++] Disallow reading struct types in Arrow reader for now
+ * [PARQUET-959] - [C++] Arrow thirdparty build fails on multiarch systems
+ * [PARQUET-962] - [C++] GTEST_MAIN_STATIC_LIB is not defined in FindGTest.cmake
+ * [PARQUET-958] - [C++] Print Parquet metadata in JSON format
+ * [PARQUET-956] - C++: BUILD_BYPRODUCTS not specified anymore for gtest
+ * [PARQUET-948] - [C++] Account for API changes in ARROW-782
+ * [PARQUET-947] - [C++] Refactor to account for ARROW-795 Arrow core library consolidation
+ * [PARQUET-965] - [C++] FIXED_LEN_BYTE_ARRAY types are unhandled in the Arrow reader
+ * [PARQUET-949] - [C++] Arrow version pinning seems to not be working properly
+ * [PARQUET-955] - [C++] pkg_check_modules will override $ARROW_HOME if it is set in the environment
+ * [PARQUET-945] - [C++] Thrift static libraries are not used with recent patch
+ * [PARQUET-943] - [C++] Overflow build error on x86
+ * [PARQUET-938] - [C++] There is a typo in cmake_modules/FindSnappy.cmake comment
+ * [PARQUET-936] - [C++] parquet::arrow::WriteTable can enter infinite loop if chunk_size is 0
+ * [PARQUET-981] - Repair usage of *_HOME 3rd party dependencies environment variables during Windows build
+ * [PARQUET-992] - [C++] parquet/compression.h leaks zlib.h
+ * [PARQUET-987] - [C++] Fix regressions caused by PARQUET-981
+ * [PARQUET-933] - [C++] Account for Arrow Table API changes coming in ARROW-728
+ * [PARQUET-915] - Support Arrow Time Types in Schema
+ * [PARQUET-914] - [C++] Throw more informative exception when user writes too many values to a column in a row group
+ * [PARQUET-923] - [C++] Account for Time metadata changes in ARROW-686
+ * [PARQUET-918] - FromParquetSchema API crashes on nested schemas
+ * [PARQUET-925] - [C++] FindArrow.cmake sets the wrong library path after ARROW-648
+ * [PARQUET-932] - [c++] Add option to build parquet library with minimal dependency
+ * [PARQUET-919] - [C++] Account for API changes in ARROW-683
+ * [PARQUET-995] - [C++] Int96 reader in parquet_arrow uses size of Int96Type instead of Int96
+
+## Improvement
+ * [PARQUET-508] - Add ParquetFilePrinter
+ * [PARQUET-595] - Add API for key-value metadata
+ * [PARQUET-897] - [C++] Only use designated public headers from libarrow
+ * [PARQUET-679] - [C++] Build and unit tests support for MSVC on Windows
+ * [PARQUET-977] - Improve MSVC build
+ * [PARQUET-957] - [C++] Add optional $PARQUET_BUILD_TOOLCHAIN environment variable option for configuring build environment
+ * [PARQUET-961] - [C++] Strip debug symbols from libparquet libraries in release builds by default
+ * [PARQUET-954] - C++: Use Brotli 0.6 release
+ * [PARQUET-953] - [C++] Change arrow::FileWriter API to be initialized from a Schema, and provide for writing multiple tables
+ * [PARQUET-941] - [C++] Stop needless Boost static library detection for CentOS 7 support
+ * [PARQUET-942] - [C++] Fix wrong variable use in FindSnappy
+ * [PARQUET-939] - [C++] Support Thrift_HOME CMake variable like FindSnappy does as Snappy_HOME
+ * [PARQUET-940] - [C++] Fix Arrow library path detection
+ * [PARQUET-937] - [C++] Support CMake < 3.4 again for Arrow detection
+ * [PARQUET-935] - [C++] Set shared library version for .deb packages
+ * [PARQUET-934] - [C++] Support multiarch on Debian
+ * [PARQUET-984] - C++: Add abi and so version to pkg-config
+ * [PARQUET-983] - C++: Update Thirdparty hash to Arrow 0.3.0
+ * [PARQUET-989] - [C++] Link dynamically to libarrow in toolchain build, set LD_LIBRARY_PATH
+ * [PARQUET-988] - [C++] Add Linux toolchain-based build to Travis CI
+ * [PARQUET-928] - [C++] Support pkg-config
+ * [PARQUET-927] - [C++] Specify shared library version of Apache Arrow
+ * [PARQUET-931] - [C++] Add option to pin thirdparty Arrow version used in ExternalProject
+ * [PARQUET-926] - [C++] Use pkg-config to find Apache Arrow
+ * [PARQUET-917] - C++: Build parquet_arrow by default
+ * [PARQUET-910] - C++: Support TIME logical type in parquet_arrow
+ * [PARQUET-909] - [CPP]: Reduce buffer allocations (mallocs) on critical path
+
+## New Feature
+ * [PARQUET-853] - [C++] Add option to link with shared boost libraries when building Arrow in the thirdparty toolchain
+ * [PARQUET-946] - [C++] Refactoring in parquet::arrow::FileReader to be able to read a single row group
+ * [PARQUET-930] - [C++] Account for all Arrow date/time types
+
+
+Parquet C++ 1.0.0
+--------------------------------------------------------------------------------
+## Bug
+ * [PARQUET-455] - Fix compiler warnings on OS X / Clang
+ * [PARQUET-558] - Support ZSH in build scripts
+ * [PARQUET-720] - Parquet-cpp fails to link when included in multiple TUs
+ * [PARQUET-718] - Reading boolean pages written by parquet-cpp fails
+ * [PARQUET-640] - [C++] Force the use of gcc 4.9 in conda builds
+ * [PARQUET-643] - Add const modifier to schema pointer reference in ParquetFileWriter
+ * [PARQUET-672] - [C++] Build testing conda artifacts in debug mode
+ * [PARQUET-661] - [C++] Do not assume that perl is found in /usr/bin
+ * [PARQUET-659] - [C++] Instantiated template visibility is broken on clang / OS X
+ * [PARQUET-657] - [C++] Don't define DISALLOW_COPY_AND_ASSIGN if already defined
+ * [PARQUET-656] - [C++] Revert PARQUET-653
+ * [PARQUET-676] - MAX_VALUES_PER_LITERAL_RUN causes RLE encoding failure
+ * [PARQUET-614] - C++: Remove unneeded LZ4-related code
+ * [PARQUET-604] - Install writer.h headers
+ * [PARQUET-621] - C++: Uninitialised DecimalMetadata is read
+ * [PARQUET-620] - C++: Duplicate calls to ParquetFileWriter::Close cause duplicate metadata writes
+ * [PARQUET-599] - ColumnWriter::RleEncodeLevels' size estimation might be wrong
+ * [PARQUET-617] - C++: Enable conda build to work on systems with non-default C++ toolchains
+ * [PARQUET-627] - Ensure that thrift headers are generated before source compilation
+ * [PARQUET-745] - TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType
+ * [PARQUET-738] - Update arrow version that also supports newer Xcode
+ * [PARQUET-747] - [C++] TypedRowGroupStatistics are not being exported in libparquet.so
+ * [PARQUET-711] - Use metadata builders in parquet writer
+ * [PARQUET-732] - Building a subset of dependencies does not work
+ * [PARQUET-760] - On switching from dictionary to the fallback encoding, an incorrect encoding is set
+ * [PARQUET-691] - [C++] Write ColumnChunk metadata after each column chunk in the file
+ * [PARQUET-797] - [C++] Update for API changes in ARROW-418
+ * [PARQUET-837] - [C++] SerializedFile::ParseMetaData uses Seek, followed by Read, and could have race conditions
+ * [PARQUET-827] - [C++] Incorporate addition of arrow::MemoryPool::Reallocate
+ * [PARQUET-502] - Scanner segfaults when its batch size is smaller than the number of rows
+ * [PARQUET-469] - Roll back Thrift bindings to 0.9.0
+ * [PARQUET-889] - Fix compilation when PARQUET_USE_SSE is on
+ * [PARQUET-888] - C++ Memory leak in RowGroupSerializer
+ * [PARQUET-819] - C++: Trying to install non-existing parquet/arrow/utils.h
+ * [PARQUET-736] - XCode 8.0 breaks builds
+ * [PARQUET-505] - Column reader: automatically handle large data pages
+ * [PARQUET-615] - C++: Building static or shared libparquet should not be mutually exclusive
+ * [PARQUET-658] - ColumnReader has no virtual destructor
+ * [PARQUET-799] - concurrent usage of the file reader API
+ * [PARQUET-513] - Valgrind errors are not failing the Travis CI build
+ * [PARQUET-841] - [C++] Writing wrong format version when using ParquetVersion::PARQUET_1_0
+ * [PARQUET-742] - Add missing license headers
+ * [PARQUET-741] - compression_buffer_ is reused although it shouldn't
+ * [PARQUET-700] - C++: Disable dictionary encoding for boolean columns
+ * [PARQUET-662] - [C++] ParquetException must be explicitly exported in dynamic libraries
+ * [PARQUET-704] - [C++] scan-all.h is not being installed
+ * [PARQUET-865] - C++: Pass all CXXFLAGS to Thrift ExternalProject
+ * [PARQUET-875] - [C++] Fix coveralls build given changes to thirdparty build procedure
+ * [PARQUET-709] - [C++] Fix conda dev binary builds
+ * [PARQUET-638] - [C++] Revert static linking of libstdc++ in conda builds until symbol visibility addressed
+ * [PARQUET-606] - Travis coverage is broken
+ * [PARQUET-880] - [CPP] Prevent destructors from throwing
+ * [PARQUET-886] - [C++] Revise build documentation and requirements in README.md
+ * [PARQUET-900] - C++: Fix NOTICE / LICENSE issues
+ * [PARQUET-885] - [C++] Do not search for Thrift in default system paths
+ * [PARQUET-879] - C++: ExternalProject compilation for Thrift fails on older CMake versions
+ * [PARQUET-635] - [C++] Statically link libstdc++ on Linux in conda recipe
+ * [PARQUET-710] - Remove unneeded private member variables from RowGroupReader ABI
+ * [PARQUET-766] - C++: Expose ParquetFileReader through Arrow reader as const
+ * [PARQUET-876] - C++: Correct snapshot version
+ * [PARQUET-821] - [C++] zlib download link is broken
+ * [PARQUET-818] - [C++] Refactor library to share IO, Buffer, and memory management abstractions with Apache Arrow
+ * [PARQUET-537] - LocalFileSource leaks resources
+ * [PARQUET-764] - [CPP] Parquet Writer does not write Boolean values correctly
+ * [PARQUET-812] - [C++] Failure reading BYTE_ARRAY data from file in parquet-compatibility project
+ * [PARQUET-759] - Cannot store columns consisting of empty strings
+ * [PARQUET-846] - [CPP] CpuInfo::Init() is not thread safe
+ * [PARQUET-694] - C++: Revert default data page size back to 1M
+ * [PARQUET-842] - [C++] Impala rejects DOUBLE columns if decimal metadata is set
+ * [PARQUET-708] - [C++] RleEncoder does not account for "worst case scenario" in MaxBufferSize for bit_width > 1
+ * [PARQUET-639] - Do not export DCHECK in public headers
+ * [PARQUET-828] - [C++] "version" field set improperly in file metadata
+ * [PARQUET-891] - [C++] Do not search for Snappy in default system paths
+ * [PARQUET-626] - Fix builds due to unavailable llvm.org apt mirror
+ * [PARQUET-629] - RowGroupSerializer should only close itself once
+ * [PARQUET-472] - Clean up InputStream ownership semantics in ColumnReader
+ * [PARQUET-739] - Rle-decoding uses static buffer that is shared across threads
+ * [PARQUET-561] - ParquetFileReader::Contents PIMPL missing a virtual destructor
+ * [PARQUET-892] - [C++] Clean up link library targets in CMake files
+ * [PARQUET-454] - Address inconsistencies in boolean decoding
+ * [PARQUET-816] - [C++] Failure decoding sample dict-encoded file from parquet-compatibility project
+ * [PARQUET-565] - Use PATH instead of DIRECTORY in get_filename_component to support CMake<2.8.12
+ * [PARQUET-446] - Hide thrift dependency in parquet-cpp
+ * [PARQUET-843] - [C++] Impala unable to read files created by parquet-cpp
+ * [PARQUET-555] - Dictionary page metadata handling inconsistencies
+ * [PARQUET-908] - Fix for PARQUET-890 introduces undefined symbol in libparquet_arrow.so
+ * [PARQUET-793] - [CPP] Do not return incorrect statistics
+ * [PARQUET-887] - C++: Fix issues in release scripts arise in RC1
+
+## Improvement
+ * [PARQUET-277] - Remove boost dependency
+ * [PARQUET-500] - Enable coveralls.io for apache/parquet-cpp
+ * [PARQUET-497] - Decouple Parquet physical file structure from FileReader class
+ * [PARQUET-597] - Add data rates to benchmark output
+ * [PARQUET-522] - #include cleanup with include-what-you-use
+ * [PARQUET-515] - Add "Reset" to LevelEncoder and LevelDecoder
+ * [PARQUET-514] - Automate coveralls.io updates in Travis CI
+ * [PARQUET-551] - Handle compiler warnings due to disabled DCHECKs in release builds
+ * [PARQUET-559] - Enable InputStream as a source to the ParquetFileReader
+ * [PARQUET-562] - Simplified ZSH support in build scripts
+ * [PARQUET-538] - Improve ColumnReader Tests
+ * [PARQUET-541] - Portable build scripts
+ * [PARQUET-724] - Test more advanced properties setting
+ * [PARQUET-641] - Instantiate stringstream only if needed in SerializedPageReader::NextPage
+ * [PARQUET-636] - Expose selection for different encodings
+ * [PARQUET-603] - Implement missing information in schema descriptor
+ * [PARQUET-610] - Print ColumnMetaData for each RowGroup
+ * [PARQUET-600] - Add benchmarks for RLE-Level encoding
+ * [PARQUET-592] - Support compressed writes
+ * [PARQUET-593] - Add API for writing Page statistics
+ * [PARQUET-589] - Implement Chunked InMemoryInputStream for better memory usage
+ * [PARQUET-587] - Implement BufferReader::Read(int64_t,uint8_t*)
+ * [PARQUET-616] - C++: WriteBatch should accept const arrays
+ * [PARQUET-630] - C++: Support link flags for older CMake versions
+ * [PARQUET-634] - Consistent private linking of dependencies
+ * [PARQUET-633] - Add version to WriterProperties
+ * [PARQUET-625] - Improve RLE read performance
+ * [PARQUET-737] - Use absolute namespace in macros
+ * [PARQUET-762] - C++: Use optimistic allocation instead of Arrow Builders
+ * [PARQUET-773] - C++: Check licenses with RAT in CI
+ * [PARQUET-687] - C++: Switch to PLAIN encoding if dictionary grows too large
+ * [PARQUET-784] - C++: Reference Spark, Kudu and FrameOfReference in LICENSE
+ * [PARQUET-809] - [C++] Add API to determine if two files' schemas are compatible
+ * [PARQUET-778] - Standardize the schema output to match the parquet-mr format
+ * [PARQUET-463] - Add DCHECK* macros for assertions in debug builds
+ * [PARQUET-471] - Use the same environment setup script for Travis CI as local sandbox development
+ * [PARQUET-449] - Update to latest parquet.thrift
+ * [PARQUET-496] - Fix cpplint configuration to be more restrictive
+ * [PARQUET-468] - Add a cmake option to generate the Parquet thrift headers with the thriftc in the environment
+ * [PARQUET-482] - Organize src code file structure to have a very clear folder with public headers.
+ * [PARQUET-591] - Page size estimation during writes
+ * [PARQUET-518] - Review usages of size_t and unsigned integers generally per Google style guide
+ * [PARQUET-533] - Simplify RandomAccessSource API to combine Seek/Read
+ * [PARQUET-767] - Add release scripts for parquet-cpp
+ * [PARQUET-699] - Update parquet.thrift from https://github.com/apache/parquet-format
+ * [PARQUET-653] - [C++] Re-enable -static-libstdc++ in dev artifact builds
+ * [PARQUET-763] - C++: Expose ParquetFileReader through Arrow reader
+ * [PARQUET-857] - [C++] Flatten parquet/encodings directory
+ * [PARQUET-862] - Provide default cache size values if CPU info probing is not available
+ * [PARQUET-689] - C++: Compress DataPages eagerly
+ * [PARQUET-874] - [C++] Use default memory allocator from Arrow
+ * [PARQUET-267] - Detach thirdparty code from build configuration.
+ * [PARQUET-418] - Add a utility to print contents of a Parquet file to stdout
+ * [PARQUET-519] - Disable compiler warning suppressions and fix all DEBUG build warnings
+ * [PARQUET-447] - Add Debug and Release build types and associated compiler flags
+ * [PARQUET-868] - C++: Build snappy with optimizations
+ * [PARQUET-894] - Fix compilation warning
+ * [PARQUET-883] - C++: Support non-standard gcc version strings
+ * [PARQUET-607] - Public Writer header
+ * [PARQUET-731] - [CPP] Add API to return metadata size and Skip reading values
+ * [PARQUET-628] - Link thrift privately
+ * [PARQUET-877] - C++: Update Arrow Hash, update Version in metadata.
+ * [PARQUET-547] - Refactor most templates to use DataType structs rather than the Type::type enum
+ * [PARQUET-882] - [CPP] Improve Application Version parsing
+ * [PARQUET-448] - Add cmake option to skip building the unit tests
+ * [PARQUET-721] - Performance benchmarks for reading into Arrow structures
+ * [PARQUET-820] - C++: Decoders should directly emit arrays with spacing for null entries
+ * [PARQUET-813] - C++: Build dependencies using CMake External project
+ * [PARQUET-488] - Add SSE-related cmake options to manage compiler flags
+ * [PARQUET-564] - Add option to run unit tests with valgrind --tool=memcheck
+ * [PARQUET-572] - Rename parquet_cpp namespace to parquet
+ * [PARQUET-829] - C++: Make use of ARROW-469
+ * [PARQUET-501] - Add an OutputStream abstraction (capable of memory allocation) for Encoder public API
+ * [PARQUET-744] - Clarifications on build instructions
+ * [PARQUET-520] - Add version of LocalFileSource that uses memory-mapping for zero-copy reads
+ * [PARQUET-556] - Extend RowGroupStatistics to include "min" "max" statistics
+ * [PARQUET-671] - Improve performance of RLE/bit-packed decoding in parquet-cpp
+ * [PARQUET-681] - Add tool to scan a parquet file
+
+## New Feature
+ * [PARQUET-499] - Complete PlainEncoder implementation for all primitive types and test end to end
+ * [PARQUET-439] - Conform all copyright headers to ASF requirements
+ * [PARQUET-436] - Implement ParquetFileWriter class entry point for generating new Parquet files
+ * [PARQUET-435] - Provide vectorized ColumnReader interface
+ * [PARQUET-438] - Update RLE encoder/decoder modules from Impala upstream changes and adapt unit tests
+ * [PARQUET-512] - Add optional google/benchmark 3rd-party dependency for performance testing
+ * [PARQUET-566] - Add method to retrieve the full column path
+ * [PARQUET-613] - C++: Add conda packaging recipe
+ * [PARQUET-605] - Expose schema node in ColumnDescriptor
+ * [PARQUET-619] - C++: Add OutputStream for local files
+ * [PARQUET-583] - Implement Parquet to Thrift schema conversion
+ * [PARQUET-582] - Conversion functions for Parquet enums to Thrift enums
+ * [PARQUET-728] - [C++] Bring parquet::arrow up to date with API changes in arrow::io
+ * [PARQUET-752] - [C++] Conform parquet_arrow to upstream API changes
+ * [PARQUET-788] - [C++] Reference Impala / Apache Impala (incubating) in LICENSE
+ * [PARQUET-808] - [C++] Add API to read file given externally-provided FileMetadata
+ * [PARQUET-807] - [C++] Add API to read file metadata only from a file handle
+ * [PARQUET-805] - C++: Read Int96 into Arrow Timestamp(ns)
+ * [PARQUET-836] - [C++] Add column selection to parquet::arrow::FileReader
+ * [PARQUET-835] - [C++] Add option to parquet::arrow to read columns in parallel using a thread pool
+ * [PARQUET-830] - [C++] Add additional configuration options to parquet::arrow::OpenFile
+ * [PARQUET-769] - C++: Add support for Brotli Compression
+ * [PARQUET-489] - Add visibility macros to be used for public and internal APIs of libparquet
+ * [PARQUET-542] - Support memory allocation from external memory
+ * [PARQUET-844] - [C++] Consolidate encodings, schema, and compression subdirectories into fewer files
+ * [PARQUET-848] - [C++] Consolidate libparquet_thrift subcomponent
+ * [PARQUET-646] - [C++] Enable easier 3rd-party toolchain clang builds on Linux
+ * [PARQUET-598] - [C++] Test writing all primitive data types
+ * [PARQUET-442] - Convert flat SchemaElement vector to implied nested schema data structure
+ * [PARQUET-867] - [C++] Support writing sliced Arrow arrays
+ * [PARQUET-456] - Add zlib codec support
+ * [PARQUET-834] - C++: Support r/w of arrow::ListArray
+ * [PARQUET-485] - Decouple data page delimiting from column reader / scanner classes, create test fixtures
+ * [PARQUET-434] - Add a ParquetFileReader class to encapsulate some low-level details of interacting with Parquet files
+ * [PARQUET-666] - PLAIN_DICTIONARY write support
+ * [PARQUET-437] - Incorporate googletest thirdparty dependency and add cmake tools (ADD_PARQUET_TEST) to simplify adding new unit tests
+ * [PARQUET-866] - [C++] Account for API changes in ARROW-33
+ * [PARQUET-545] - Improve API to support Decimal type
+ * [PARQUET-579] - Add API for writing Column statistics
+ * [PARQUET-494] - Implement PLAIN_DICTIONARY encoding and decoding
+ * [PARQUET-618] - C++: Automatically upload conda build artifacts on commits to master
+ * [PARQUET-833] - C++: Provide API to write spaced arrays (e.g. Arrow)
+ * [PARQUET-903] - C++: Add option to set RPATH to ORIGIN
+ * [PARQUET-451] - Add a RowGroup reader interface class
+ * [PARQUET-785] - C++: List conversion for Arrow Schemas
+ * [PARQUET-712] - C++: Read into Arrow memory
+ * [PARQUET-890] - C++: Support I/O of DATE columns in parquet_arrow
+ * [PARQUET-782] - C++: Support writing to Arrow sinks
+ * [PARQUET-849] - [C++] Upgrade default Thrift in thirdparty toolchain to 0.9.3 or 0.10
+ * [PARQUET-573] - C++: Create a public API for reading and writing file metadata
+
+## Task
+ * [PARQUET-814] - C++: Remove Conda recipes
+ * [PARQUET-503] - Re-enable parquet 2.0 encodings
+ * [PARQUET-169] - Parquet-cpp: Implement support for bulk reading and writing repetition/definition levels.
+ * [PARQUET-878] - C++: Remove setup_build_env from rc-verification script
+ * [PARQUET-881] - C++: Update Arrow hash to 0.2.0-rc2
+ * [PARQUET-771] - C++: Sync KEYS file
+ * [PARQUET-901] - C++: Publish RCs in apache-parquet-VERSION in SVN
+
+## Test
+ * [PARQUET-525] - Test coverage for malformed file failure modes on the read path
+ * [PARQUET-703] - [C++] Validate num_values metadata for columns with nulls
+ * [PARQUET-507] - Improve runtime of rle-test.cc
+ * [PARQUET-549] - Add scanner and column reader tests for dictionary data pages
+ * [PARQUET-457] - Add compressed data page unit tests
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3511485..36121c7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -289,6 +289,14 @@ Pass multiple labels by dividing with semicolons")
Always OFF if building binaries"
OFF)
+ option(PARQUET_BUILD_EXECUTABLES
+ "Build the Parquet executable CLI tools"
+ OFF)
+
+ option(PARQUET_BUILD_EXAMPLES
+ "Build the Parquet examples"
+ OFF)
+
set(PARQUET_ARROW_LINKAGE "shared" CACHE STRING
"How to link Arrow libraries with libparquet.so. static|shared (default shared)")
@@ -764,4 +772,6 @@ endif()
if(ARROW_PARQUET)
add_subdirectory(src/parquet)
+ add_subdirectory(tools/parquet)
+ add_subdirectory(examples/parquet/low-level-api)
endif()
diff --git a/cpp/examples/parquet/low-level-api/CMakeLists.txt b/cpp/examples/parquet/low-level-api/CMakeLists.txt
index 64ba110..26e8220 100644
--- a/cpp/examples/parquet/low-level-api/CMakeLists.txt
+++ b/cpp/examples/parquet/low-level-api/CMakeLists.txt
@@ -15,11 +15,15 @@
# specific language governing permissions and limitations
# under the License.
-if (PARQUET_BUILD_EXECUTABLES)
- add_executable(reader-writer reader-writer.cc)
- add_executable(reader-writer2 reader-writer2.cc)
- target_include_directories(reader-writer PRIVATE .)
- target_include_directories(reader-writer2 PRIVATE .)
- target_link_libraries(reader-writer parquet_static)
- target_link_libraries(reader-writer2 parquet_static)
+if (PARQUET_BUILD_EXAMPLES)
+ add_executable(parquet-reader-writer reader-writer.cc)
+ add_executable(parquet-reader-writer2 reader-writer2.cc)
+ target_include_directories(parquet-reader-writer PRIVATE .)
+ target_include_directories(parquet-reader-writer2 PRIVATE .)
+ target_link_libraries(parquet-reader-writer parquet_static)
+ target_link_libraries(parquet-reader-writer2 parquet_static)
+
+ add_dependencies(parquet
+ parquet-reader-writer
+ parquet-reader-writer2)
endif()
diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
index 897fcfb..892ec92 100644
--- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
+++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
@@ -32,41 +32,9 @@ set(CMAKE_CXX_STANDARD 11)
# We require a C++11 compliant compiler
set(CMAKE_CXX_STANDARD_REQUIRED ON)
-# We want to link dynamically against Arrow and Parquet
-set(PARQUET_BUILD_SHARED ON)
-
-
# First search the packages in the system. If they are not found, use CMake's
# ExternalProject mechanism to build them locally.
find_package(Arrow)
-if (NOT ARROW_FOUND)
- # set compile output directory
- if (NOT CMAKE_BUILD_TYPE)
- set(CMAKE_BUILD_TYPE Debug)
- endif(NOT CMAKE_BUILD_TYPE)
- string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME)
- # If build in-source, create the latest symlink. If build out-of-source, which is
- # preferred, simply output the binaries in the build folder
- if (${CMAKE_SOURCE_DIR} STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
- set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}")
- # Link build/latest to the current build directory, to avoid developers
- # accidentally running the latest debug build when in fact they're building
- # release builds.
- FILE(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY})
- if (NOT APPLE)
- set(MORE_ARGS "-T")
- endif()
- EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY}
- ${CMAKE_CURRENT_BINARY_DIR}/build/latest)
- else()
- set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}")
- endif()
-
- include(ArrowExternalProject)
- set(ARROW_VENDORED 1)
-else()
- set(ARROW_VENDORED 0)
-endif()
find_package(Parquet)
include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
deleted file mode 120000
index b535f6e..0000000
--- a/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
+++ /dev/null
@@ -1 +0,0 @@
-../../../cmake_modules/ArrowExternalProject.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
index 6c451ce..865f5da 120000
--- a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
@@ -1 +1 @@
-../../../cmake_modules/FindArrow.cmake
\ No newline at end of file
+../../../../cmake_modules/FindArrow.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
index f333cab..8154d7a 100644
--- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
+++ b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
@@ -26,7 +26,7 @@
// some data.
std::shared_ptr<arrow::Table> generate_table() {
arrow::Int64Builder i64builder;
- PARQUET_THROW_NOT_OK(i64builder.Append({1, 2, 3, 4, 5}));
+ PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5}));
std::shared_ptr<arrow::Array> i64array;
PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index 0705c83..f77c093 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -16,16 +16,18 @@
# under the License.
if (PARQUET_BUILD_EXECUTABLES)
- set(EXECUTABLE_TOOLS
+ set(PARQUET_TOOLS
parquet-dump-schema
parquet_reader
parquet-scan)
- foreach(TOOL ${EXECUTABLE_TOOLS})
+ foreach(TOOL ${PARQUET_TOOLS})
add_executable(${TOOL} "${TOOL}.cc")
target_link_libraries(${TOOL} parquet_static)
# Avoid unsetting RPATH when installing
set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
install(TARGETS ${TOOL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
endforeach(TOOL)
+
+ add_dependencies(parquet ${PARQUET_TOOLS})
endif()
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 7bdef5d..e366ae3 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -7,6 +7,8 @@
*.json
*.snap
.github/ISSUE_TEMPLATE.md
+cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
+cpp/CHANGELOG_PARQUET.md
cpp/src/arrow/io/mman.h
cpp/src/arrow/util/random.h
cpp/src/arrow/status.cc
[arrow] 06/24: PARQUET-807: Allow user to retain ownership of
parquet::FileMetaData.
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 58ad1a8b68ebdb9034d0ea0150aa068875437467
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Thu Jan 5 12:19:31 2017 -0500
PARQUET-807: Allow user to retain ownership of parquet::FileMetaData.
Also implements PARQUET-808: opening file with existing metadata object.
This allows a user to create a reader only for the purposes of obtaining the metadata.
Do you all think it's worth having a convenience method for reading the metadata out of a file?
Author: Wes McKinney <we...@twosigma.com>
Closes #213 from wesm/PARQUET-807 and squashes the following commits:
c1b5c7c [Wes McKinney] Use ReadMetaData function in test
d382cca [Wes McKinney] Add note about ARROW-455
05ecd37 [Wes McKinney] Implement/test opening with provided metadata. Do not close Arrow output files automatically
d790bb5 [Wes McKinney] Tweak
0dd4184 [Wes McKinney] Add ReadMetaData convenience method
97527ba [Wes McKinney] Change FileMetaData in ParquetFileReader to a shared_ptr so that ownership can be transferred away
Change-Id: Ic76c48671b41816d3dc1ebb736eb5a06c907c95d
---
cpp/examples/parquet/reader-writer.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 0289eed..59ee63b 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -226,7 +226,7 @@ int main(int argc, char** argv) {
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);
// Get the File MetaData
- const parquet::FileMetaData* file_metadata = parquet_reader->metadata();
+ std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
// Get the number of RowGroups
int num_row_groups = file_metadata->num_row_groups();
[arrow] 21/24: PARQUET-1270: Install executable tools
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit fca0625b6f67e45bba7f321e8a93be742d456455
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Apr 17 19:08:31 2018 +0200
PARQUET-1270: Install executable tools
"parquet_reader" and friends should be installed along with the Parquet libraries.
Author: Antoine Pitrou <an...@python.org>
Closes #455 from pitrou/PARQUET-1270-install-tools and squashes the following commits:
44c486c [Antoine Pitrou] PARQUET-1270: Install executable tools
Change-Id: I96fa2c9262716eeceddbf2d844e3805095431693
---
cpp/tools/parquet/CMakeLists.txt | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index d473d08..0705c83 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -16,12 +16,16 @@
# under the License.
if (PARQUET_BUILD_EXECUTABLES)
- add_executable(parquet-dump-schema parquet-dump-schema.cc)
- target_link_libraries(parquet-dump-schema parquet_static)
+ set(EXECUTABLE_TOOLS
+ parquet-dump-schema
+ parquet_reader
+ parquet-scan)
- add_executable(parquet_reader parquet_reader.cc)
- target_link_libraries(parquet_reader parquet_static)
-
- add_executable(parquet-scan parquet-scan.cc)
- target_link_libraries(parquet-scan parquet_static)
+ foreach(TOOL ${EXECUTABLE_TOOLS})
+ add_executable(${TOOL} "${TOOL}.cc")
+ target_link_libraries(${TOOL} parquet_static)
+ # Avoid unsetting RPATH when installing
+ set_target_properties(${TOOL} PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+ install(TARGETS ${TOOL} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endforeach(TOOL)
endif()
[arrow] 15/24: PARQUET-1048: Apache Arrow static transitive
dependencies
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 80dc88363019b34a1e0bbffe6317533711fbfcd4
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Mon Jul 10 19:40:10 2017 +0200
PARQUET-1048: Apache Arrow static transitive dependencies
Author: Deepak Majeti <de...@hpe.com>
Closes #367 from majetideepak/PARQUET-1048 and squashes the following commits:
42635f2 [Deepak Majeti] try another script
bb96b28 [Deepak Majeti] disable boost shared
b3a0e75 [Deepak Majeti] enable mem check
db50b65 [Deepak Majeti] build arrow shared only if parquet shared is specified
5fe857e [Deepak Majeti] Remove mem check
68f8c59 [Deepak Majeti] Fix Arrow visibility warnings
0bbd073 [Deepak Majeti] add env vars
e4406ab [Deepak Majeti] Add static linking for travis ci
032c8c7 [Deepak Majeti] Fix linking of benchmarks and examples
bdb5b2f [Deepak Majeti] Add Arrow lib transitive dependencies
Change-Id: I7869ce951e2087d713775787b5239921ada43da0
---
cpp/examples/parquet/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt
index 594295d..721fa9a 100644
--- a/cpp/examples/parquet/CMakeLists.txt
+++ b/cpp/examples/parquet/CMakeLists.txt
@@ -17,5 +17,5 @@
if (PARQUET_BUILD_EXECUTABLES)
add_executable(reader-writer reader-writer.cc)
- target_link_libraries(reader-writer parquet_shared arrow)
+ target_link_libraries(reader-writer parquet_static)
endif()
[arrow] 19/24: PARQUET-1177: Add PARQUET_BUILD_WARNING_LEVEL option
and more rigorous Clang warnings
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit eefd36eba3e33fc5df570905c8d85162fd1728db
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Wed Dec 13 17:19:22 2017 -0500
PARQUET-1177: Add PARQUET_BUILD_WARNING_LEVEL option and more rigorous Clang warnings
These warnings will catch a number of things that have bitten us in the past, like missing virtual destructors. This brings Parquet's compiler warnings up to the same quality as Arrow's
Author: Wes McKinney <we...@twosigma.com>
Author: Wes McKinney <we...@gmail.com>
Closes #425 from wesm/PARQUET-1177 and squashes the following commits:
3769a8c [Wes McKinney] Add -Wno-missing-noreturn
5b6cd80 [Wes McKinney] Compile with /bigobj in MSVC
cc5bca0 [Wes McKinney] Add noreturn to static methods in ParquetException
e3ffb71 [Wes McKinney] Fix -Wconversion warnings in decode_benchmark.cc
758a216 [Wes McKinney] Fix warnings on macOS Clang
3aef3b4 [Wes McKinney] Do not pass -Werror via PARQUET_CXXFLAGS
5a98e81 [Wes McKinney] Fix usage of PrimitiveArray::raw_values
c848855 [Wes McKinney] Fix compiler warnings with gcc 4.9
ca9a374 [Wes McKinney] Add SetupCxxFlags.cmake from Apache Arrow. Add PARQUET_BUILD_WARNING_LEVEL flag. Fix Clang compiler warnings
Change-Id: I428d1d90bc4eb3dab8b56a538d1eb58656664b74
---
cpp/examples/parquet/reader-writer.cc | 16 ++++++++--------
cpp/tools/parquet/parquet-scan.cc | 3 ++-
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 7136b28..fb2ec77 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -170,7 +170,7 @@ int main(int argc, char** argv) {
parquet::FloatWriter* float_writer =
static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- float value = i * 1.1f;
+ float value = static_cast<float>(i) * 1.1f;
float_writer->WriteBatch(1, nullptr, nullptr, &value);
}
@@ -188,9 +188,9 @@ int main(int argc, char** argv) {
for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
parquet::ByteArray value;
char hello[FIXED_LENGTH] = "parquet";
- hello[7] = '0' + i / 100;
- hello[8] = '0' + (i / 10) % 10;
- hello[9] = '0' + i % 10;
+ hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
+ hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
+ hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
if (i % 2 == 0) {
int16_t definition_level = 1;
value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
@@ -369,7 +369,7 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- float expected_value = i * 1.1f;
+ float expected_value = static_cast<float>(i) * 1.1f;
assert(value == expected_value);
i++;
}
@@ -411,9 +411,9 @@ int main(int argc, char** argv) {
assert(rows_read == 1);
// Verify the value written
char expected_value[FIXED_LENGTH] = "parquet";
- expected_value[7] = '0' + i / 100;
- expected_value[8] = '0' + (i / 10) % 10;
- expected_value[9] = '0' + i % 10;
+ expected_value[7] = static_cast<char>('0' + i / 100);
+ expected_value[8] = static_cast<char>('0' + (i / 10) % 10);
+ expected_value[9] = static_cast<char>('0' + i % 10);
if (i % 2 == 0) { // only alternate values exist
// There are no NULL values in the rows written
assert(values_read == 1);
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index fdc73d7..ab9363b 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -65,7 +65,8 @@ int main(int argc, char** argv) {
int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
- total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
+ total_time = static_cast<double>(std::clock() - start_time) /
+ static_cast<double>(CLOCKS_PER_SEC);
std::cout << total_rows << " rows scanned in " << total_time << " seconds."
<< std::endl;
} catch (const std::exception& e) {
[arrow] 03/24: PARQUET-728: Incorporate upstream Arrow API changes
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 5d0f5acf51bafe786243d630c3ac0cab25ae2a6e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Sun Sep 25 19:22:48 2016 -0400
PARQUET-728: Incorporate upstream Arrow API changes
I bumped the thirdparty git hash to Arrow trunk. Pardon the diff noise from clang format. Also resolves PARQUET-729.
Author: Wes McKinney <we...@twosigma.com>
Closes #167 from wesm/PARQUET-728 and squashes the following commits:
222e72c [Wes McKinney] Write multiple columns in file-serialize-test. Fix PARQUET-729
ef56871 [Wes McKinney] Incorporate upstream Arrow API changes, clang format with LLVM 3.8.0
Change-Id: I673b1538c93623ae50ebf70cd8d9c7048781c982
---
cpp/tools/parquet/parquet-scan.cc | 2 +-
cpp/tools/parquet/parquet_reader.cc | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index d146a1d..f0bbb8e 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -17,8 +17,8 @@
#include <ctime>
#include <iostream>
-#include <memory>
#include <list>
+#include <memory>
#include "parquet/api/reader.h"
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
index ced84d5..bc0711f 100644
--- a/cpp/tools/parquet/parquet_reader.cc
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -16,8 +16,8 @@
// under the License.
#include <iostream>
-#include <memory>
#include <list>
+#include <memory>
#include "parquet/api/reader.h"
[arrow] 09/24: PARQUET-508: Add ParquetFilePrinter
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit bfb1e9af6db19f5cf23803726a0e53776bb3ae24
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Fri Apr 21 10:21:56 2017 -0400
PARQUET-508: Add ParquetFilePrinter
Author: Deepak Majeti <de...@hpe.com>
Closes #307 from majetideepak/PARQUET-508 and squashes the following commits:
de1364a [Deepak Majeti] add to api
1dc6260 [Deepak Majeti] PARQUET-508: Add ParquetFilePrinter
Change-Id: Ic581d6c47fbe2a055622e9345918f6b64ebaf08e
---
cpp/tools/parquet/parquet_reader.cc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
index bc0711f..25f81c1 100644
--- a/cpp/tools/parquet/parquet_reader.cc
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -57,7 +57,8 @@ int main(int argc, char** argv) {
try {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename, memory_map);
- reader->DebugPrint(std::cout, columns, print_values);
+ parquet::ParquetFilePrinter printer(reader.get());
+ printer.DebugPrint(std::cout, columns, print_values);
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;
return -1;
[arrow] 01/24: PARQUET-681: Add tool to scan a parquet file
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 6beeaf4d40ea36c048e9a1a2d4410ca4e5d66864
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Mon Sep 5 15:49:36 2016 -0400
PARQUET-681: Add tool to scan a parquet file
Added a ReadBatchValues() API to the Column class.
Added a parquet-scan tool
Separated examples into benchmarks/tools
added clang tidy and clang format to benchmarks and tools
Author: Deepak Majeti <de...@hpe.com>
Closes #144 from majetideepak/parquetscan and squashes the following commits:
cc7f183 [Deepak Majeti] Removed GetRemainingInPage API
44da480 [Deepak Majeti] add scan all in public api
20829b8 [Deepak Majeti] clang-format
da62354 [Deepak Majeti] ScanAllValues API
e385f61 [Deepak Majeti] put clang-* in the root directory
9ff785c [Deepak Majeti] use c++ random
d854bde [Deepak Majeti] parquet scan tool
Change-Id: I1e5d1e42aa5a3e8dfbe6b556dd0081bb0ed7f4d8
---
cpp/tools/parquet/CMakeLists.txt | 34 ++++++++++
cpp/tools/parquet/parquet-dump-schema.cc | 36 +++++++++++
cpp/tools/parquet/parquet-scan.cc | 108 +++++++++++++++++++++++++++++++
cpp/tools/parquet/parquet_reader.cc | 67 +++++++++++++++++++
4 files changed, 245 insertions(+)
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
new file mode 100644
index 0000000..5c4eaa8
--- /dev/null
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+SET(LINK_LIBS
+ snappystatic
+ thriftstatic)
+
+if (PARQUET_BUILD_EXECUTABLES)
+ add_executable(parquet-dump-schema parquet-dump-schema.cc)
+ target_link_libraries(parquet-dump-schema ${LINK_LIBS}
+ parquet_static)
+
+ add_executable(parquet_reader parquet_reader.cc)
+ target_link_libraries(parquet_reader ${LINK_LIBS}
+ parquet_static)
+
+ add_executable(parquet-scan parquet-scan.cc)
+ target_link_libraries(parquet-scan ${LINK_LIBS}
+ parquet_static)
+endif()
diff --git a/cpp/tools/parquet/parquet-dump-schema.cc b/cpp/tools/parquet/parquet-dump-schema.cc
new file mode 100644
index 0000000..deef2fd
--- /dev/null
+++ b/cpp/tools/parquet/parquet-dump-schema.cc
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+
+#include "parquet/api/reader.h"
+#include "parquet/api/schema.h"
+
+int main(int argc, char** argv) {
+ std::string filename = argv[1];
+
+ try {
+ std::unique_ptr<parquet::ParquetFileReader> reader =
+ parquet::ParquetFileReader::OpenFile(filename);
+ PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
new file mode 100644
index 0000000..d146a1d
--- /dev/null
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <ctime>
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+int main(int argc, char** argv) {
+ if (argc > 4 || argc < 1) {
+ std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
+ << std::endl;
+ return -1;
+ }
+
+ std::string filename;
+
+ // Read command-line options
+ int batch_size = 256;
+ const std::string COLUMNS_PREFIX = "--columns=";
+ const std::string BATCH_SIZE_PREFIX = "--batch-size=";
+ std::vector<int> columns;
+ int num_columns = 0;
+
+ char *param, *value;
+ for (int i = 1; i < argc; i++) {
+ if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+ value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
+ while (value) {
+ columns.push_back(std::atoi(value));
+ value = std::strtok(nullptr, ",");
+ num_columns++;
+ }
+ } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
+ value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
+ if (value) { batch_size = std::atoi(value); }
+ } else {
+ filename = argv[i];
+ }
+ }
+
+ std::vector<int16_t> rep_levels(batch_size);
+ std::vector<int16_t> def_levels(batch_size);
+ try {
+ double total_time;
+ std::clock_t start_time = std::clock();
+ std::unique_ptr<parquet::ParquetFileReader> reader =
+ parquet::ParquetFileReader::OpenFile(filename);
+ // columns are not specified explicitly. Add all columns
+ if (num_columns == 0) {
+ num_columns = reader->metadata()->num_columns();
+ columns.resize(num_columns);
+ for (int i = 0; i < num_columns; i++) {
+ columns[i] = i;
+ }
+ }
+
+ int64_t total_rows[num_columns];
+
+ for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+ auto group_reader = reader->RowGroup(r);
+ int col = 0;
+ for (auto i : columns) {
+ total_rows[col] = 0;
+ std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
+ size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+ std::vector<uint8_t> values(batch_size * value_byte_size);
+
+ int64_t values_read = 0;
+ while (col_reader->HasNext()) {
+ total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
+ rep_levels.data(), values.data(), &values_read, col_reader.get());
+ }
+ col++;
+ }
+ }
+
+ total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
+ for (int ct = 1; ct < num_columns; ++ct) {
+ if (total_rows[0] != total_rows[ct]) {
+ std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
+ }
+ }
+ std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+ << std::endl;
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ return 0;
+}
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
new file mode 100644
index 0000000..ced84d5
--- /dev/null
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <memory>
+#include <list>
+
+#include "parquet/api/reader.h"
+
+int main(int argc, char** argv) {
+ if (argc > 5 || argc < 2) {
+ std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+ "[--columns=...] <file>"
+ << std::endl;
+ return -1;
+ }
+
+ std::string filename;
+ bool print_values = true;
+ bool memory_map = true;
+
+ // Read command-line options
+ const std::string COLUMNS_PREFIX = "--columns=";
+ std::list<int> columns;
+
+ char *param, *value;
+ for (int i = 1; i < argc; i++) {
+ if ((param = std::strstr(argv[i], "--only-metadata"))) {
+ print_values = false;
+ } else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
+ memory_map = false;
+ } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
+ value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
+ while (value) {
+ columns.push_back(std::atoi(value));
+ value = std::strtok(nullptr, ",");
+ }
+ } else {
+ filename = argv[i];
+ }
+ }
+
+ try {
+ std::unique_ptr<parquet::ParquetFileReader> reader =
+ parquet::ParquetFileReader::OpenFile(filename, memory_map);
+ reader->DebugPrint(std::cout, columns, print_values);
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ return 0;
+}
[arrow] 04/24: PARQUET-702: Add a writer + reader example with
detailed comments
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit de3f844000243761b3775c87932de39b23903852
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Tue Nov 15 21:13:30 2016 +0100
PARQUET-702: Add a writer + reader example with detailed comments
1. Added a reader writer example.
- Covered all the physical types supported by Parquet.
Author: Deepak Majeti <de...@hpe.com>
Closes #187 from majetideepak/readerwriterexample and squashes the following commits:
c7061d9 [Deepak Majeti] review comments
d075ec5 [Deepak Majeti] Use Typed Writer and Reader
7c2357c [Deepak Majeti] Review comments
616ccdb [Deepak Majeti] add Optional and repetition fields
3bb2b08 [Deepak Majeti] clang format
0eea968 [Deepak Majeti] Parquet Reader Writer example
Change-Id: I50da1d9924451fe3f84a1a20917cf6b7f42e93f9
---
cpp/examples/parquet/CMakeLists.txt | 26 ++
cpp/examples/parquet/reader-writer.cc | 450 ++++++++++++++++++++++++++++++++++
2 files changed, 476 insertions(+)
diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt
new file mode 100644
index 0000000..204cc27
--- /dev/null
+++ b/cpp/examples/parquet/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+SET(LINK_LIBS
+ snappystatic
+ thriftstatic)
+
+if (PARQUET_BUILD_EXECUTABLES)
+ add_executable(reader-writer reader-writer.cc)
+ target_link_libraries(reader-writer ${LINK_LIBS}
+ parquet_static)
+endif()
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
new file mode 100644
index 0000000..cc066ac
--- /dev/null
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -0,0 +1,450 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <memory>
+
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+
+/*
+ * This example describes writing and reading Parquet Files in C++ and serves as a
+ * reference to the API.
+ * The file contains all the physical data types supported by Parquet.
+**/
+
+/* Parquet is a structured columnar file format
+ * Parquet File = "Parquet data" + "Parquet Metadata"
+ * "Parquet data" is simply a vector of RowGroups. Each RowGroup is a batch of rows in a
+ * columnar layout
+ * "Parquet Metadata" contains the "file schema" and attributes of the RowGroups and their
+ * Columns
+ * "file schema" is a tree where each node is either a primitive type (leaf nodes) or a
+ * complex (nested) type (internal nodes)
+ * For specific details, please refer the format here:
+ * https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+**/
+
+constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
+constexpr int FIXED_LENGTH = 10;
+const std::string PARQUET_FILENAME = "parquet_cpp_example.parquet";
+
+using parquet::Repetition;
+using parquet::Type;
+using parquet::LogicalType;
+using parquet::schema::PrimitiveNode;
+using parquet::schema::GroupNode;
+
+static std::shared_ptr<GroupNode> SetupSchema() {
+ parquet::schema::NodeVector fields;
+ // Create a primitive node named 'boolean_field' with type:BOOLEAN,
+ // repetition:REQUIRED
+ fields.push_back(PrimitiveNode::Make(
+ "boolean_field", Repetition::REQUIRED, Type::BOOLEAN, LogicalType::NONE));
+
+ // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
+ // logical type:TIME_MILLIS
+ fields.push_back(PrimitiveNode::Make(
+ "int32_field", Repetition::REQUIRED, Type::INT32, LogicalType::TIME_MILLIS));
+
+ // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
+ fields.push_back(PrimitiveNode::Make(
+ "int64_field", Repetition::REPEATED, Type::INT64, LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make(
+ "int96_field", Repetition::REQUIRED, Type::INT96, LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make(
+ "float_field", Repetition::REQUIRED, Type::FLOAT, LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make(
+ "double_field", Repetition::REQUIRED, Type::DOUBLE, LogicalType::NONE));
+
+ // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
+ fields.push_back(PrimitiveNode::Make(
+ "ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::NONE));
+
+ // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
+ // repetition:REQUIRED, field_length = FIXED_LENGTH
+ fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, FIXED_LENGTH));
+
+ // Create a GroupNode named 'schema' using the primitive nodes defined above
+ // This GroupNode is the root node of the schema tree
+ return std::static_pointer_cast<GroupNode>(
+ GroupNode::Make("schema", Repetition::REQUIRED, fields));
+}
+
+int main(int argc, char** argv) {
+ /**********************************************************************************
+ PARQUET WRITER EXAMPLE
+ **********************************************************************************/
+ // parquet::REQUIRED fields do not need definition and repetition level values
+ // parquet::OPTIONAL fields require only definition level values
+ // parquet::REPEATED fields require both definition and repetition level values
+ try {
+ // Create a local file output stream instance.
+ std::shared_ptr<parquet::OutputStream> out_file =
+ std::make_shared<parquet::LocalFileOutputStream>(PARQUET_FILENAME);
+
+ // Setup the parquet schema
+ std::shared_ptr<GroupNode> schema = SetupSchema();
+
+ // Create a ParquetFileWriter instance
+ std::shared_ptr<parquet::ParquetFileWriter> file_writer =
+ parquet::ParquetFileWriter::Open(out_file, schema);
+
+ // Append a RowGroup with a specific number of rows.
+ parquet::RowGroupWriter* rg_writer =
+ file_writer->AppendRowGroup(NUM_ROWS_PER_ROW_GROUP);
+
+ // Write the Bool column
+ parquet::BoolWriter* bool_writer =
+ static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ bool value = ((i % 2) == 0) ? true : false;
+ bool_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Int32 column
+ parquet::Int32Writer* int32_writer =
+ static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ int32_t value = i;
+ int32_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Int64 column. Each row has repeats twice.
+ parquet::Int64Writer* int64_writer =
+ static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) {
+ int64_t value = i * 1000 * 1000;
+ value *= 1000 * 1000;
+ int16_t definition_level = 1;
+ int16_t repetition_level = 0;
+ if ((i % 2) == 0) {
+ repetition_level = 1; // start of a new record
+ }
+ int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value);
+ }
+
+ // Write the INT96 column.
+ parquet::Int96Writer* int96_writer =
+ static_cast<parquet::Int96Writer*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::Int96 value;
+ value.value[0] = i;
+ value.value[1] = i + 1;
+ value.value[2] = i + 2;
+ int96_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Float column
+ parquet::FloatWriter* float_writer =
+ static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ float value = i * 1.1;
+ float_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the Double column
+ parquet::DoubleWriter* double_writer =
+ static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ double value = i * 1.1111111;
+ double_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Write the ByteArray column. Make every alternate values NULL
+ parquet::ByteArrayWriter* ba_writer =
+ static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::ByteArray value;
+ char hello[FIXED_LENGTH] = "parquet";
+ hello[7] = '0' + i / 100;
+ hello[8] = '0' + (i / 10) % 10;
+ hello[9] = '0' + i % 10;
+ if (i % 2 == 0) {
+ int16_t definition_level = 1;
+ value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
+ value.len = FIXED_LENGTH;
+ ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
+ } else {
+ int16_t definition_level = 0;
+ ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
+ }
+ }
+
+ // Write the FixedLengthByteArray column
+ parquet::FixedLenByteArrayWriter* flba_writer =
+ static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->NextColumn());
+ for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
+ parquet::FixedLenByteArray value;
+ char v = static_cast<char>(i);
+ char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+ value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
+
+ flba_writer->WriteBatch(1, nullptr, nullptr, &value);
+ }
+
+ // Close the ParquetFileWriter
+ file_writer->Close();
+
+ // Write the bytes to file
+ out_file->Close();
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet write error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ /**********************************************************************************
+ PARQUET READER EXAMPLE
+ **********************************************************************************/
+
+ try {
+ // Create a ParquetReader instance
+ std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
+ parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);
+ // Get the File MetaData
+ const parquet::FileMetaData* file_metadata = parquet_reader->metadata();
+
+ // Get the number of RowGroups
+ int num_row_groups = file_metadata->num_row_groups();
+ assert(num_row_groups == 1);
+
+ // Get the number of Columns
+ int num_columns = file_metadata->num_columns();
+ assert(num_columns == 8);
+
+ // Iterate over all the RowGroups in the file
+ for (int r = 0; r < num_row_groups; ++r) {
+ // Get the RowGroup Reader
+ std::shared_ptr<parquet::RowGroupReader> row_group_reader =
+ parquet_reader->RowGroup(r);
+
+ int64_t values_read = 0;
+ int64_t rows_read = 0;
+ int16_t definition_level;
+ int16_t repetition_level;
+ int i;
+ std::shared_ptr<parquet::ColumnReader> column_reader;
+
+ // Get the Column Reader for the boolean column
+ column_reader = row_group_reader->Column(0);
+ parquet::BoolReader* bool_reader =
+ static_cast<parquet::BoolReader*>(column_reader.get());
+
+ // Read all the rows in the column
+ i = 0;
+ while (bool_reader->HasNext()) {
+ bool value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = bool_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ bool expected_value = ((i % 2) == 0) ? true : false;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the Int32 column
+ column_reader = row_group_reader->Column(1);
+ parquet::Int32Reader* int32_reader =
+ static_cast<parquet::Int32Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int32_reader->HasNext()) {
+ int32_t value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int32_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ assert(value == i);
+ i++;
+ }
+
+ // Get the Column Reader for the Int64 column
+ column_reader = row_group_reader->Column(2);
+ parquet::Int64Reader* int64_reader =
+ static_cast<parquet::Int64Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int64_reader->HasNext()) {
+ int64_t value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int64_reader->ReadBatch(
+ 1, &definition_level, &repetition_level, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ int64_t expected_value = i * 1000 * 1000;
+ expected_value *= 1000 * 1000;
+ assert(value == expected_value);
+ if ((i % 2) == 0) {
+ assert(repetition_level == 1);
+ } else {
+ assert(repetition_level == 0);
+ }
+ i++;
+ }
+
+ // Get the Column Reader for the Int96 column
+ column_reader = row_group_reader->Column(3);
+ parquet::Int96Reader* int96_reader =
+ static_cast<parquet::Int96Reader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (int96_reader->HasNext()) {
+ parquet::Int96 value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = int96_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ parquet::Int96 expected_value;
+ expected_value.value[0] = i;
+ expected_value.value[1] = i + 1;
+ expected_value.value[2] = i + 2;
+ for (int j = 0; j < 3; j++) {
+ assert(value.value[j] == expected_value.value[j]);
+ }
+ i++;
+ }
+
+ // Get the Column Reader for the Float column
+ column_reader = row_group_reader->Column(4);
+ parquet::FloatReader* float_reader =
+ static_cast<parquet::FloatReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (float_reader->HasNext()) {
+ float value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = float_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ float expected_value = i * 1.1;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the Double column
+ column_reader = row_group_reader->Column(5);
+ parquet::DoubleReader* double_reader =
+ static_cast<parquet::DoubleReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (double_reader->HasNext()) {
+ double value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = double_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ double expected_value = i * 1.1111111;
+ assert(value == expected_value);
+ i++;
+ }
+
+ // Get the Column Reader for the ByteArray column
+ column_reader = row_group_reader->Column(6);
+ parquet::ByteArrayReader* ba_reader =
+ static_cast<parquet::ByteArrayReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (ba_reader->HasNext()) {
+ parquet::ByteArray value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read =
+ ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // Verify the value written
+ char expected_value[FIXED_LENGTH] = "parquet";
+ expected_value[7] = '0' + i / 100;
+ expected_value[8] = '0' + (i / 10) % 10;
+ expected_value[9] = '0' + i % 10;
+ if (i % 2 == 0) { // only alternate values exist
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ assert(value.len == FIXED_LENGTH);
+ assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+ assert(definition_level == 1);
+ } else {
+ // There are NULL values in the rows written
+ assert(values_read == 0);
+ assert(definition_level == 0);
+ }
+ i++;
+ }
+
+ // Get the Column Reader for the FixedLengthByteArray column
+ column_reader = row_group_reader->Column(7);
+ parquet::FixedLenByteArrayReader* flba_reader =
+ static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
+ // Read all the rows in the column
+ i = 0;
+ while (flba_reader->HasNext()) {
+ parquet::FixedLenByteArray value;
+ // Read one value at a time. The number of rows read is returned. values_read
+ // contains the number of non-null rows
+ rows_read = flba_reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
+ // Ensure only one value is read
+ assert(rows_read == 1);
+ // There are no NULL values in the rows written
+ assert(values_read == 1);
+ // Verify the value written
+ char v = static_cast<char>(i);
+ char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
+ assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
+ i++;
+ }
+ }
+ } catch (const std::exception& e) {
+ std::cerr << "Parquet read error: " << e.what() << std::endl;
+ return -1;
+ }
+
+ std::cout << "Parquet Writing and Reading Complete" << std::endl;
+
+ return 0;
+}
[arrow] 07/24: PARQUET-892: Specify public link targets for
parquet_static so that transitive dependencies are linked in executables
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 22a96d9a69fb8786ee101606fa29b4905a61077c
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Thu Feb 23 09:22:06 2017 +0100
PARQUET-892: Specify public link targets for parquet_static so that transitive dependencies are linked in executables
Author: Wes McKinney <we...@twosigma.com>
Closes #259 from wesm/PARQUET-892 and squashes the following commits:
1e932c4 [Wes McKinney] Specify public link targets for parquet_static so that transitive dependencies are linked in executables
Change-Id: Ib0bea0cd272919dffa82aa1c12d7658ea15d9cb5
---
cpp/examples/parquet/CMakeLists.txt | 7 +------
cpp/tools/parquet/CMakeLists.txt | 13 +++----------
2 files changed, 4 insertions(+), 16 deletions(-)
diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt
index 204cc27..721fa9a 100644
--- a/cpp/examples/parquet/CMakeLists.txt
+++ b/cpp/examples/parquet/CMakeLists.txt
@@ -15,12 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-SET(LINK_LIBS
- snappystatic
- thriftstatic)
-
if (PARQUET_BUILD_EXECUTABLES)
add_executable(reader-writer reader-writer.cc)
- target_link_libraries(reader-writer ${LINK_LIBS}
- parquet_static)
+ target_link_libraries(reader-writer parquet_static)
endif()
diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
index 5c4eaa8..d473d08 100644
--- a/cpp/tools/parquet/CMakeLists.txt
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -15,20 +15,13 @@
# specific language governing permissions and limitations
# under the License.
-SET(LINK_LIBS
- snappystatic
- thriftstatic)
-
if (PARQUET_BUILD_EXECUTABLES)
add_executable(parquet-dump-schema parquet-dump-schema.cc)
- target_link_libraries(parquet-dump-schema ${LINK_LIBS}
- parquet_static)
+ target_link_libraries(parquet-dump-schema parquet_static)
add_executable(parquet_reader parquet_reader.cc)
- target_link_libraries(parquet_reader ${LINK_LIBS}
- parquet_static)
+ target_link_libraries(parquet_reader parquet_static)
add_executable(parquet-scan parquet-scan.cc)
- target_link_libraries(parquet-scan ${LINK_LIBS}
- parquet_static)
+ target_link_libraries(parquet-scan parquet_static)
endif()
[arrow] 14/24: PARQUET-1029: [C++] Some extern template symbols not
being exported in gcc
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 4d21fd36f9a0772249a32f3532a0ec209a9b0030
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Wed Jun 14 10:31:14 2017 -0400
PARQUET-1029: [C++] Some extern template symbols not being exported in gcc
Extern template visibility continues to be a mystery to me, but this fixes the regression from PARQUET-991.
cc @saatvikshah1994
Author: Wes McKinney <we...@twosigma.com>
Closes #352 from wesm/PARQUET-1029 and squashes the following commits:
3f52852 [Wes McKinney] Some extern template symbols not being exported in gcc
Change-Id: Icd71f67290c25de8f9fc38153b48a70e20bfa438
---
cpp/examples/parquet/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt
index 721fa9a..594295d 100644
--- a/cpp/examples/parquet/CMakeLists.txt
+++ b/cpp/examples/parquet/CMakeLists.txt
@@ -17,5 +17,5 @@
if (PARQUET_BUILD_EXECUTABLES)
add_executable(reader-writer reader-writer.cc)
- target_link_libraries(reader-writer parquet_static)
+ target_link_libraries(reader-writer parquet_shared arrow)
endif()
[arrow] 02/24: PARQUET-711: Use metadata builders in parquet writer
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit d1e8433b09ec16d9d9e98fa71a51542c99a36df7
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Thu Sep 8 23:06:08 2016 -0400
PARQUET-711: Use metadata builders in parquet writer
I wrote a sample file and the metadata seems to be correct.
@xhochy I fixed some missing metadata like `dictionary_page_offset`. You might want to check if this fixes the Drill problem.
Author: Deepak Majeti <de...@hpe.com>
Closes #156 from majetideepak/PARQUET-711 and squashes the following commits:
25f5a7e [Deepak Majeti] fix schema and descr. Resolves PARQUET-705 and PARQUET-707
8b4784d [Deepak Majeti] Review comments to add methods back
fdbc761 [Deepak Majeti] fix clang error and comments
c6cb071 [Deepak Majeti] convert DCHECKS to Exceptions in metadata
ada3ac2 [Deepak Majeti] clang format
d9c9131 [Deepak Majeti] Use metadata builders in parquet writer
Change-Id: Iabe80b1cbe3fd8f1de6239187058b6402b160975
---
cpp/tools/parquet/parquet-dump-schema.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/tools/parquet/parquet-dump-schema.cc b/cpp/tools/parquet/parquet-dump-schema.cc
index deef2fd..1e0239b 100644
--- a/cpp/tools/parquet/parquet-dump-schema.cc
+++ b/cpp/tools/parquet/parquet-dump-schema.cc
@@ -26,7 +26,7 @@ int main(int argc, char** argv) {
try {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename);
- PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
+ PrintSchema(reader->metadata()->schema()->schema_root().get(), std::cout);
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;
return -1;
[arrow] 20/24: PARQUET-1196: Example parquet_arrow project
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 3a9dbdf467382f380eef6d36c50f2c302bb07ca0
Author: Uwe L. Korn <uw...@xhochy.com>
AuthorDate: Thu Feb 15 18:42:22 2018 +0100
PARQUET-1196: Example parquet_arrow project
Depends on https://github.com/apache/parquet-cpp/pull/434
Author: Uwe L. Korn <uw...@xhochy.com>
Author: Korn, Uwe <Uw...@blue-yonder.com>
Closes #436 from xhochy/PARQUET-1196 and squashes the following commits:
a938da7 [Uwe L. Korn] Check Status for PrettyPrint
15d62f3 [Uwe L. Korn] PARQUET-1196: Example parquet_arrow project
1280fd5 [Korn, Uwe] PARQUET-1200: Support reading a single Arrow column from a Parquet file
Change-Id: I907f2276b319491f6e02117f4a21ab2383006a99
---
.../parquet/{ => low-level-api}/CMakeLists.txt | 0
.../parquet/{ => low-level-api}/reader-writer.cc | 0
cpp/examples/parquet/parquet-arrow/CMakeLists.txt | 78 +++++++++++
cpp/examples/parquet/parquet-arrow/README.md | 20 +++
.../cmake_modules/ArrowExternalProject.cmake | 1 +
.../parquet-arrow/cmake_modules/FindArrow.cmake | 1 +
.../parquet-arrow/cmake_modules/FindParquet.cmake | 145 +++++++++++++++++++++
.../parquet/parquet-arrow/src/reader-writer.cc | 134 +++++++++++++++++++
8 files changed, 379 insertions(+)
diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/low-level-api/CMakeLists.txt
similarity index 100%
rename from cpp/examples/parquet/CMakeLists.txt
rename to cpp/examples/parquet/low-level-api/CMakeLists.txt
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/low-level-api/reader-writer.cc
similarity index 100%
rename from cpp/examples/parquet/reader-writer.cc
rename to cpp/examples/parquet/low-level-api/reader-writer.cc
diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
new file mode 100644
index 0000000..897fcfb
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Require cmake that supports BYPRODUCTS in add_custom_command, ExternalProject_Add [1].
+cmake_minimum_required(VERSION 3.2.0)
+
+project(parquet-arrow-example)
+
+include(ExternalProject)
+include(FindPkgConfig)
+include(GNUInstallDirs)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
+
+# This ensures that things like gnu++11 get passed correctly
+set(CMAKE_CXX_STANDARD 11)
+
+# We require a C++11 compliant compiler
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# We want to link dynamically against Arrow and Parquet
+set(PARQUET_BUILD_SHARED ON)
+
+
+# First search the packages in the system. If they are not found, use CMake's
+# ExternalProject mechanism to build them locally.
+find_package(Arrow)
+if (NOT ARROW_FOUND)
+ # set compile output directory
+ if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Debug)
+ endif(NOT CMAKE_BUILD_TYPE)
+ string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME)
+ # If build in-source, create the latest symlink. If build out-of-source, which is
+ # preferred, simply output the binaries in the build folder
+ if (${CMAKE_SOURCE_DIR} STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
+ set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}")
+ # Link build/latest to the current build directory, to avoid developers
+ # accidentally running the latest debug build when in fact they're building
+ # release builds.
+ FILE(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY})
+ if (NOT APPLE)
+ set(MORE_ARGS "-T")
+ endif()
+ EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY}
+ ${CMAKE_CURRENT_BINARY_DIR}/build/latest)
+ else()
+ set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}")
+ endif()
+
+ include(ArrowExternalProject)
+ set(ARROW_VENDORED 1)
+else()
+ set(ARROW_VENDORED 0)
+endif()
+find_package(Parquet)
+
+include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
+
+add_executable(parquet-arrow-reader-writer src/reader-writer.cc)
+target_link_libraries(parquet-arrow-reader-writer ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
+if (ARROW_VENDORED)
+ add_dependencies(parquet-arrow-reader-writer arrow_ep)
+endif()
diff --git a/cpp/examples/parquet/parquet-arrow/README.md b/cpp/examples/parquet/parquet-arrow/README.md
new file mode 100644
index 0000000..e99819f
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/README.md
@@ -0,0 +1,20 @@
+<!---
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. See accompanying LICENSE file.
+-->
+
+Using parquet-cpp with the arrow interface
+==========================================
+
+This folder contains an example project that shows how to setup a CMake project
+that consumes `parquet-cpp` as a library as well as how you can use the
+`parquet/arrow` interface to read and write Apache Parquet files.
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
new file mode 120000
index 0000000..b535f6e
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/ArrowExternalProject.cmake
@@ -0,0 +1 @@
+../../../cmake_modules/ArrowExternalProject.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
new file mode 120000
index 0000000..6c451ce
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindArrow.cmake
@@ -0,0 +1 @@
+../../../cmake_modules/FindArrow.cmake
\ No newline at end of file
diff --git a/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake
new file mode 100644
index 0000000..8bbe05f
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/cmake_modules/FindParquet.cmake
@@ -0,0 +1,145 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so)
+# This module defines
+# PARQUET_INCLUDE_DIR, directory containing headers
+# PARQUET_LIBS, directory containing parquet libraries
+# PARQUET_STATIC_LIB, path to libparquet.a
+# PARQUET_SHARED_LIB, path to libparquet's shared library
+# PARQUET_SHARED_IMP_LIB, path to libparquet's import library (MSVC only)
+# PARQUET_FOUND, whether parquet has been found
+
+include(FindPkgConfig)
+
+if(NOT "$ENV{PARQUET_HOME}" STREQUAL "")
+ set(PARQUET_HOME "$ENV{PARQUET_HOME}")
+endif()
+
+if (MSVC)
+ SET(CMAKE_FIND_LIBRARY_SUFFIXES ".lib" ".dll")
+
+ if (MSVC AND NOT PARQUET_MSVC_STATIC_LIB_SUFFIX)
+ set(PARQUET_MSVC_STATIC_LIB_SUFFIX "_static")
+ endif()
+
+ find_library(PARQUET_SHARED_LIBRARIES NAMES parquet
+ PATHS ${PARQUET_HOME} NO_DEFAULT_PATH
+ PATH_SUFFIXES "bin" )
+
+ get_filename_component(PARQUET_SHARED_LIBS ${PARQUET_SHARED_LIBRARIES} PATH )
+endif ()
+
+if(PARQUET_HOME)
+ set(PARQUET_SEARCH_HEADER_PATHS
+ ${PARQUET_HOME}/include
+ )
+ set(PARQUET_SEARCH_LIB_PATH
+ ${PARQUET_HOME}/lib
+ )
+ find_path(PARQUET_INCLUDE_DIR parquet/api/reader.h PATHS
+ ${PARQUET_SEARCH_HEADER_PATHS}
+ # make sure we don't accidentally pick up a different version
+ NO_DEFAULT_PATH
+ )
+ find_library(PARQUET_LIBRARIES NAMES parquet
+ PATHS ${PARQUET_HOME} NO_DEFAULT_PATH
+ PATH_SUFFIXES "lib")
+ get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH )
+
+ # Try to autodiscover the Parquet ABI version
+ get_filename_component(PARQUET_LIB_REALPATH ${PARQUET_LIBRARIES} REALPATH)
+ get_filename_component(PARQUET_EXT_REALPATH ${PARQUET_LIB_REALPATH} EXT)
+ string(REGEX MATCH ".([0-9]+.[0-9]+.[0-9]+)" HAS_ABI_VERSION ${PARQUET_EXT_REALPATH})
+ if (HAS_ABI_VERSION)
+ if (APPLE)
+ string(REGEX REPLACE ".([0-9]+.[0-9]+.[0-9]+).dylib" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH})
+ else()
+ string(REGEX REPLACE ".so.([0-9]+.[0-9]+.[0-9]+)" "\\1" PARQUET_ABI_VERSION ${PARQUET_EXT_REALPATH})
+ endif()
+ string(REGEX REPLACE "([0-9]+).[0-9]+.[0-9]+" "\\1" PARQUET_SO_VERSION ${PARQUET_ABI_VERSION})
+ else()
+ set(PARQUET_ABI_VERSION "1.0.0")
+ set(PARQUET_SO_VERSION "1")
+ endif()
+else()
+ pkg_check_modules(PARQUET parquet)
+ if (PARQUET_FOUND)
+ pkg_get_variable(PARQUET_ABI_VERSION parquet abi_version)
+ message(STATUS "Parquet C++ ABI version: ${PARQUET_ABI_VERSION}")
+ pkg_get_variable(PARQUET_SO_VERSION parquet so_version)
+ message(STATUS "Parquet C++ SO version: ${PARQUET_SO_VERSION}")
+ set(PARQUET_INCLUDE_DIR ${PARQUET_INCLUDE_DIRS})
+ set(PARQUET_LIBS ${PARQUET_LIBRARY_DIRS})
+ set(PARQUET_SEARCH_LIB_PATH ${PARQUET_LIBRARY_DIRS})
+ message(STATUS "Searching for parquet libs in: ${PARQUET_SEARCH_LIB_PATH}")
+ find_library(PARQUET_LIBRARIES NAMES parquet
+ PATHS ${PARQUET_SEARCH_LIB_PATH} NO_DEFAULT_PATH)
+ else()
+ find_path(PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h )
+ find_library(PARQUET_LIBRARIES NAMES parquet)
+ get_filename_component(PARQUET_LIBS ${PARQUET_LIBRARIES} PATH )
+ endif()
+endif()
+
+if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES)
+ set(PARQUET_FOUND TRUE)
+ set(PARQUET_LIB_NAME parquet)
+ if (MSVC)
+ set(PARQUET_STATIC_LIB "${PARQUET_LIBS}/${PARQUET_LIB_NAME}${PARQUET_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+ set(PARQUET_SHARED_LIB "${PARQUET_SHARED_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+ set(PARQUET_SHARED_IMP_LIB "${PARQUET_LIBS}/${PARQUET_LIB_NAME}.lib")
+ else()
+ set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${CMAKE_STATIC_LIBRARY_PREFIX}${PARQUET_LIB_NAME}.a)
+ set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${CMAKE_SHARED_LIBRARY_PREFIX}${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+ endif()
+else ()
+ set(PARQUET_FOUND FALSE)
+endif ()
+
+if (PARQUET_FOUND)
+ if (NOT Parquet_FIND_QUIETLY)
+ message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}")
+ endif ()
+else ()
+ if (NOT Parquet_FIND_QUIETLY)
+ if (NOT PARQUET_FOUND)
+ set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} Could not find the parquet library.")
+ endif()
+
+ set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} Looked in ")
+ if ( _parquet_roots )
+ set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.")
+ else ()
+ set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.")
+ endif ()
+ if (Parquet_FIND_REQUIRED)
+ message(FATAL_ERROR "${PARQUET_ERR_MSG}")
+ else (Parquet_FIND_REQUIRED)
+ message(STATUS "${PARQUET_ERR_MSG}")
+ endif (Parquet_FIND_REQUIRED)
+ endif ()
+endif ()
+
+mark_as_advanced(
+ PARQUET_FOUND
+ PARQUET_INCLUDE_DIR
+ PARQUET_LIBS
+ PARQUET_LIBRARIES
+ PARQUET_STATIC_LIB
+ PARQUET_SHARED_LIB
+)
diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
new file mode 100644
index 0000000..f333cab
--- /dev/null
+++ b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/arrow/writer.h>
+#include <parquet/exception.h>
+
+// #0 Build dummy data to pass around
+// To have some input data, we first create an Arrow Table that holds
+// some data.
+std::shared_ptr<arrow::Table> generate_table() {
+ arrow::Int64Builder i64builder;
+ PARQUET_THROW_NOT_OK(i64builder.Append({1, 2, 3, 4, 5}));
+ std::shared_ptr<arrow::Array> i64array;
+ PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
+
+ arrow::StringBuilder strbuilder;
+ PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
+ PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
+ std::shared_ptr<arrow::Array> strarray;
+ PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
+
+ std::shared_ptr<arrow::Schema> schema = arrow::schema(
+ {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
+
+ return arrow::Table::Make(schema, {i64array, strarray});
+}
+
+// #1 Write out the data as a Parquet file
+void write_parquet_file(const arrow::Table& table) {
+ std::shared_ptr<arrow::io::FileOutputStream> outfile;
+ PARQUET_THROW_NOT_OK(
+ arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet", &outfile));
+ // The last argument to the function call is the size of the RowGroup in
+ // the parquet file. Normally you would choose this to be rather large but
+ // for the example, we use a small value to have multiple RowGroups.
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
+}
+
+// #2: Fully read in the file
+void read_whole_file() {
+ std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+ "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+ std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+ << " columns." << std::endl;
+}
+
+// #3: Read only a single RowGroup of the parquet file
+void read_single_rowgroup() {
+ std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+ "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Table> table;
+ PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
+ std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
+ << " columns." << std::endl;
+}
+
+// #4: Read only a single column of the whole parquet file
+void read_single_column() {
+ std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+ "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Array> array;
+ PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
+ PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+ std::cout << std::endl;
+}
+
+// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
+// from the Parquet file.
+void read_single_column_chunk() {
+ std::cout << "Reading first ColumnChunk of the first RowGroup of "
+ "parquet-arrow-example.parquet"
+ << std::endl;
+ std::shared_ptr<arrow::io::ReadableFile> infile;
+ PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
+ "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
+
+ std::unique_ptr<parquet::arrow::FileReader> reader;
+ PARQUET_THROW_NOT_OK(
+ parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+ std::shared_ptr<arrow::Array> array;
+ PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
+ PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
+ std::cout << std::endl;
+}
+
+int main(int argc, char** argv) {
+ std::shared_ptr<arrow::Table> table = generate_table();
+ write_parquet_file(*table);
+ read_whole_file();
+ read_single_rowgroup();
+ read_single_column();
+ read_single_column_chunk();
+}
[arrow] 11/24: PARQUET-595: API for KeyValue metadata
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 3e150214ec80301f329deb54649c6b8e87866242
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Sat Apr 29 13:57:50 2017 -0400
PARQUET-595: API for KeyValue metadata
This supersedes #309 and incorporates the `std::shared_ptr<const KeyValueMetadata>` pattern so less copying is needed in Parquet for metadata inbound from Arrow (and vice versa).
close #309
Author: Wes McKinney <we...@twosigma.com>
Author: Phillip Cloud <cp...@gmail.com>
Closes #314 from wesm/PARQUET-595 and squashes the following commits:
c0199c5 [Wes McKinney] Remove some more std::string includes
3d3be4e [Wes McKinney] Remove string include
b2ed09e [Wes McKinney] Add backwards compatible schema APIs
116575a [Wes McKinney] Use std::shared_ptr<const KeyValueMetadata> from upstream Arrow
5116eaa [Phillip Cloud] Add support for reading/writing Schema-level Arrow metadata
Change-Id: Ib46a73ac77cc952b032f0f93ee3297808b9f959e
---
cpp/examples/parquet/reader-writer.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 54390e0..9118c88 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -46,7 +46,7 @@
constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
constexpr int FIXED_LENGTH = 10;
-const std::string PARQUET_FILENAME = "parquet_cpp_example.parquet";
+const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet";
using parquet::Repetition;
using parquet::Type;
[arrow] 22/24: PARQUET-1256: Add --print-key-value-metadata option
to parquet_reader tool
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 37207bdb509edd5ddd5edb33f9714c2c10497d10
Author: Jacek Pliszka <Ja...@gmail.com>
AuthorDate: Fri Aug 17 17:52:29 2018 -0400
PARQUET-1256: Add --print-key-value-metadata option to parquet_reader tool
This is a minor change useful for debugging.
Now the parquet_reader tool has a --print-key-value-metadata option which, when present, dumps the key-value metadata of the file.
Created https://issues.apache.org/jira/browse/PARQUET-1256
Author: Jacek Pliszka <Ja...@gmail.com>
Closes #450 from JacekPliszka/master and squashes the following commits:
0d9a108 [Jacek Pliszka] Added --print-key-value-metadata option to parquet_reader tool
Change-Id: Ie6ad3037bcfe59128ad39d2408c8a2656666fbc1
---
cpp/tools/parquet/parquet_reader.cc | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
index 7ef59dc..34bdfc1 100644
--- a/cpp/tools/parquet/parquet_reader.cc
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -24,13 +24,14 @@
int main(int argc, char** argv) {
if (argc > 5 || argc < 2) {
std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]"
- "[--columns=...] <file>"
+ "[--print-key-value-metadata] [--columns=...] <file>"
<< std::endl;
return -1;
}
std::string filename;
bool print_values = true;
+ bool print_key_value_metadata = false;
bool memory_map = true;
bool format_json = false;
@@ -42,6 +43,8 @@ int main(int argc, char** argv) {
for (int i = 1; i < argc; i++) {
if ((param = std::strstr(argv[i], "--only-metadata"))) {
print_values = false;
+ } else if ((param = std::strstr(argv[i], "--print-key-value-metadata"))) {
+ print_key_value_metadata = true;
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
memory_map = false;
} else if ((param = std::strstr(argv[i], "--json"))) {
@@ -64,7 +67,8 @@ int main(int argc, char** argv) {
if (format_json) {
printer.JSONPrint(std::cout, columns, filename.c_str());
} else {
- printer.DebugPrint(std::cout, columns, print_values, filename.c_str());
+ printer.DebugPrint(std::cout, columns, print_values,
+ print_key_value_metadata, filename.c_str());
}
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;
[arrow] 23/24: PARQUET-1372: Add an API to allow writing RowGroups
based on size
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 074915450a5f0b1dbc117d16a167e32e2ccdb4d7
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Sat Aug 25 13:33:35 2018 +0200
PARQUET-1372: Add an API to allow writing RowGroups based on size
I split the changes into multiple commits to ease the review.
Used the example program to test the new API.
I will add unit tests once we converge on the API after review.
Thanks to @anatolishein for collaborating with the API design.
Author: Deepak Majeti <de...@hpe.com>
Closes #484 from majetideepak/PARQUET-1372 and squashes the following commits:
143ed51 [Deepak Majeti] improve comments
c10fe08 [Deepak Majeti] Add test
d12b10b [Deepak Majeti] Review comments
cb99b3f [Deepak Majeti] fix compiler warnings
e179a4c [Deepak Majeti] add example header
710bbe0 [Deepak Majeti] clang format
9e03004 [Deepak Majeti] reorg examples
410a3af [Deepak Majeti] remove flush_on_close
e148817 [Deepak Majeti] add BufferedPageWriter
26a52c1 [Deepak Majeti] clang format
20049c0 [Deepak Majeti] modify examples
9db26a2 [Deepak Majeti] Combine RowGroupWriter2 with RowGroupWriter
cb7d69c [Deepak Majeti] fix compiler errors
21642b3 [Deepak Majeti] clang format
530b835 [Deepak Majeti] example for RowGroupWriter2
0fc1f5c [Deepak Majeti] Extend Column Writer to flush pages on Close
f2f420d [Deepak Majeti] RowGroupWriter2, implementation that writes all columns at once
Change-Id: I749cbde733780c9a6499df6738b2236124b8a9f7
---
cpp/examples/parquet/low-level-api/CMakeLists.txt | 4 +
.../parquet/low-level-api/reader-writer.cc | 60 +---
.../{reader-writer.cc => reader-writer2.cc} | 313 ++++++++++-----------
cpp/examples/parquet/low-level-api/reader_writer.h | 71 +++++
4 files changed, 220 insertions(+), 228 deletions(-)
diff --git a/cpp/examples/parquet/low-level-api/CMakeLists.txt b/cpp/examples/parquet/low-level-api/CMakeLists.txt
index 721fa9a..64ba110 100644
--- a/cpp/examples/parquet/low-level-api/CMakeLists.txt
+++ b/cpp/examples/parquet/low-level-api/CMakeLists.txt
@@ -17,5 +17,9 @@
if (PARQUET_BUILD_EXECUTABLES)
add_executable(reader-writer reader-writer.cc)
+ add_executable(reader-writer2 reader-writer2.cc)
+ target_include_directories(reader-writer PRIVATE .)
+ target_include_directories(reader-writer2 PRIVATE .)
target_link_libraries(reader-writer parquet_static)
+ target_link_libraries(reader-writer2 parquet_static)
endif()
diff --git a/cpp/examples/parquet/low-level-api/reader-writer.cc b/cpp/examples/parquet/low-level-api/reader-writer.cc
index fb2ec77..09cd137 100644
--- a/cpp/examples/parquet/low-level-api/reader-writer.cc
+++ b/cpp/examples/parquet/low-level-api/reader-writer.cc
@@ -18,19 +18,16 @@
#include <cassert>
#include <fstream>
#include <iostream>
-#include <list>
#include <memory>
-#include <arrow/io/file.h>
-#include <arrow/util/logging.h>
-
-#include <parquet/api/reader.h>
-#include <parquet/api/writer.h>
+#include <reader_writer.h>
/*
* This example describes writing and reading Parquet Files in C++ and serves as a
* reference to the API.
* The file contains all the physical data types supported by Parquet.
+ * This example uses the RowGroupWriter API that supports writing RowGroups optimized for
+ *memory consumption
**/
/* Parquet is a structured columnar file format
@@ -46,56 +43,8 @@
**/
constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
-constexpr int FIXED_LENGTH = 10;
const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet";
-using parquet::Repetition;
-using parquet::Type;
-using parquet::LogicalType;
-using parquet::schema::PrimitiveNode;
-using parquet::schema::GroupNode;
-
-static std::shared_ptr<GroupNode> SetupSchema() {
- parquet::schema::NodeVector fields;
- // Create a primitive node named 'boolean_field' with type:BOOLEAN,
- // repetition:REQUIRED
- fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
- Type::BOOLEAN, LogicalType::NONE));
-
- // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
- // logical type:TIME_MILLIS
- fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
- LogicalType::TIME_MILLIS));
-
- // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
- fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
- LogicalType::NONE));
-
- // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
- fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
- LogicalType::NONE));
-
- // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
- // repetition:REQUIRED, field_length = FIXED_LENGTH
- fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
- Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE,
- FIXED_LENGTH));
-
- // Create a GroupNode named 'schema' using the primitive nodes defined above
- // This GroupNode is the root node of the schema tree
- return std::static_pointer_cast<GroupNode>(
- GroupNode::Make("schema", Repetition::REQUIRED, fields));
-}
-
int main(int argc, char** argv) {
/**********************************************************************************
PARQUET WRITER EXAMPLE
@@ -122,8 +71,7 @@ int main(int argc, char** argv) {
parquet::ParquetFileWriter::Open(out_file, schema, props);
// Append a RowGroup with a specific number of rows.
- parquet::RowGroupWriter* rg_writer =
- file_writer->AppendRowGroup(NUM_ROWS_PER_ROW_GROUP);
+ parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
// Write the Bool column
parquet::BoolWriter* bool_writer =
diff --git a/cpp/examples/parquet/low-level-api/reader-writer.cc b/cpp/examples/parquet/low-level-api/reader-writer2.cc
similarity index 62%
copy from cpp/examples/parquet/low-level-api/reader-writer.cc
copy to cpp/examples/parquet/low-level-api/reader-writer2.cc
index fb2ec77..dded5fa 100644
--- a/cpp/examples/parquet/low-level-api/reader-writer.cc
+++ b/cpp/examples/parquet/low-level-api/reader-writer2.cc
@@ -18,19 +18,16 @@
#include <cassert>
#include <fstream>
#include <iostream>
-#include <list>
#include <memory>
-#include <arrow/io/file.h>
-#include <arrow/util/logging.h>
-
-#include <parquet/api/reader.h>
-#include <parquet/api/writer.h>
+#include <reader_writer.h>
/*
* This example describes writing and reading Parquet Files in C++ and serves as a
* reference to the API.
* The file contains all the physical data types supported by Parquet.
+ * This example uses the RowGroupWriter API that supports writing RowGroups based on a
+ *certain size
**/
/* Parquet is a structured columnar file format
@@ -45,56 +42,9 @@
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
**/
-constexpr int NUM_ROWS_PER_ROW_GROUP = 500;
-constexpr int FIXED_LENGTH = 10;
-const char PARQUET_FILENAME[] = "parquet_cpp_example.parquet";
-
-using parquet::Repetition;
-using parquet::Type;
-using parquet::LogicalType;
-using parquet::schema::PrimitiveNode;
-using parquet::schema::GroupNode;
-
-static std::shared_ptr<GroupNode> SetupSchema() {
- parquet::schema::NodeVector fields;
- // Create a primitive node named 'boolean_field' with type:BOOLEAN,
- // repetition:REQUIRED
- fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
- Type::BOOLEAN, LogicalType::NONE));
-
- // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
- // logical type:TIME_MILLIS
- fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
- LogicalType::TIME_MILLIS));
-
- // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
- fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
- LogicalType::NONE));
-
- fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
- LogicalType::NONE));
-
- // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
- fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
- LogicalType::NONE));
-
- // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
- // repetition:REQUIRED, field_length = FIXED_LENGTH
- fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
- Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE,
- FIXED_LENGTH));
-
- // Create a GroupNode named 'schema' using the primitive nodes defined above
- // This GroupNode is the root node of the schema tree
- return std::static_pointer_cast<GroupNode>(
- GroupNode::Make("schema", Repetition::REQUIRED, fields));
-}
+constexpr int NUM_ROWS = 2500000;
+constexpr int64_t ROW_GROUP_SIZE = 16 * 1024 * 1024; // 16 MB
+const char PARQUET_FILENAME[] = "parquet_cpp_example2.parquet";
int main(int argc, char** argv) {
/**********************************************************************************
@@ -121,99 +71,118 @@ int main(int argc, char** argv) {
std::shared_ptr<parquet::ParquetFileWriter> file_writer =
parquet::ParquetFileWriter::Open(out_file, schema, props);
- // Append a RowGroup with a specific number of rows.
- parquet::RowGroupWriter* rg_writer =
- file_writer->AppendRowGroup(NUM_ROWS_PER_ROW_GROUP);
+ // Append a BufferedRowGroup to keep the RowGroup open until a certain size
+ parquet::RowGroupWriter* rg_writer = file_writer->AppendBufferedRowGroup();
- // Write the Bool column
- parquet::BoolWriter* bool_writer =
- static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- bool value = ((i % 2) == 0) ? true : false;
- bool_writer->WriteBatch(1, nullptr, nullptr, &value);
- }
+ int num_columns = file_writer->num_columns();
+ std::vector<int64_t> buffered_values_estimate(num_columns, 0);
+ for (int i = 0; i < NUM_ROWS; i++) {
+ int64_t estimated_bytes = 0;
+ // Get the estimated size of the values that are not written to a page yet
+ for (int n = 0; n < num_columns; n++) {
+ estimated_bytes += buffered_values_estimate[n];
+ }
- // Write the Int32 column
- parquet::Int32Writer* int32_writer =
- static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- int32_t value = i;
- int32_writer->WriteBatch(1, nullptr, nullptr, &value);
- }
+ // We need to consider the compressed pages
+ // as well as the values that are not compressed yet
+ if ((rg_writer->total_bytes_written() + rg_writer->total_compressed_bytes() +
+ estimated_bytes) > ROW_GROUP_SIZE) {
+ rg_writer->Close();
+ std::fill(buffered_values_estimate.begin(), buffered_values_estimate.end(), 0);
+ rg_writer = file_writer->AppendBufferedRowGroup();
+ }
- // Write the Int64 column. Each row has repeats twice.
- parquet::Int64Writer* int64_writer =
- static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
- for (int i = 0; i < 2 * NUM_ROWS_PER_ROW_GROUP; i++) {
- int64_t value = i * 1000 * 1000;
- value *= 1000 * 1000;
+ int col_id = 0;
+ // Write the Bool column
+ parquet::BoolWriter* bool_writer =
+ static_cast<parquet::BoolWriter*>(rg_writer->column(col_id));
+ bool bool_value = ((i % 2) == 0) ? true : false;
+ bool_writer->WriteBatch(1, nullptr, nullptr, &bool_value);
+ buffered_values_estimate[col_id] = bool_writer->EstimatedBufferedValueBytes();
+
+ // Write the Int32 column
+ col_id++;
+ parquet::Int32Writer* int32_writer =
+ static_cast<parquet::Int32Writer*>(rg_writer->column(col_id));
+ int32_t int32_value = i;
+ int32_writer->WriteBatch(1, nullptr, nullptr, &int32_value);
+ buffered_values_estimate[col_id] = int32_writer->EstimatedBufferedValueBytes();
+
+ // Write the Int64 column. Each row has repeats twice.
+ col_id++;
+ parquet::Int64Writer* int64_writer =
+ static_cast<parquet::Int64Writer*>(rg_writer->column(col_id));
+ int64_t int64_value1 = 2 * i;
int16_t definition_level = 1;
int16_t repetition_level = 0;
- if ((i % 2) == 0) {
- repetition_level = 1; // start of a new record
- }
- int64_writer->WriteBatch(1, &definition_level, &repetition_level, &value);
- }
-
- // Write the INT96 column.
- parquet::Int96Writer* int96_writer =
- static_cast<parquet::Int96Writer*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- parquet::Int96 value;
- value.value[0] = i;
- value.value[1] = i + 1;
- value.value[2] = i + 2;
- int96_writer->WriteBatch(1, nullptr, nullptr, &value);
- }
-
- // Write the Float column
- parquet::FloatWriter* float_writer =
- static_cast<parquet::FloatWriter*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- float value = static_cast<float>(i) * 1.1f;
- float_writer->WriteBatch(1, nullptr, nullptr, &value);
- }
-
- // Write the Double column
- parquet::DoubleWriter* double_writer =
- static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- double value = i * 1.1111111;
- double_writer->WriteBatch(1, nullptr, nullptr, &value);
- }
-
- // Write the ByteArray column. Make every alternate values NULL
- parquet::ByteArrayWriter* ba_writer =
- static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- parquet::ByteArray value;
+ int64_writer->WriteBatch(1, &definition_level, &repetition_level, &int64_value1);
+ int64_t int64_value2 = (2 * i + 1);
+ repetition_level = 1; // start of a new record
+ int64_writer->WriteBatch(1, &definition_level, &repetition_level, &int64_value2);
+ buffered_values_estimate[col_id] = int64_writer->EstimatedBufferedValueBytes();
+
+ // Write the INT96 column.
+ col_id++;
+ parquet::Int96Writer* int96_writer =
+ static_cast<parquet::Int96Writer*>(rg_writer->column(col_id));
+ parquet::Int96 int96_value;
+ int96_value.value[0] = i;
+ int96_value.value[1] = i + 1;
+ int96_value.value[2] = i + 2;
+ int96_writer->WriteBatch(1, nullptr, nullptr, &int96_value);
+ buffered_values_estimate[col_id] = int96_writer->EstimatedBufferedValueBytes();
+
+ // Write the Float column
+ col_id++;
+ parquet::FloatWriter* float_writer =
+ static_cast<parquet::FloatWriter*>(rg_writer->column(col_id));
+ float float_value = static_cast<float>(i) * 1.1f;
+ float_writer->WriteBatch(1, nullptr, nullptr, &float_value);
+ buffered_values_estimate[col_id] = float_writer->EstimatedBufferedValueBytes();
+
+ // Write the Double column
+ col_id++;
+ parquet::DoubleWriter* double_writer =
+ static_cast<parquet::DoubleWriter*>(rg_writer->column(col_id));
+ double double_value = i * 1.1111111;
+ double_writer->WriteBatch(1, nullptr, nullptr, &double_value);
+ buffered_values_estimate[col_id] = double_writer->EstimatedBufferedValueBytes();
+
+ // Write the ByteArray column. Make every alternate values NULL
+ col_id++;
+ parquet::ByteArrayWriter* ba_writer =
+ static_cast<parquet::ByteArrayWriter*>(rg_writer->column(col_id));
+ parquet::ByteArray ba_value;
char hello[FIXED_LENGTH] = "parquet";
hello[7] = static_cast<char>(static_cast<int>('0') + i / 100);
hello[8] = static_cast<char>(static_cast<int>('0') + (i / 10) % 10);
hello[9] = static_cast<char>(static_cast<int>('0') + i % 10);
if (i % 2 == 0) {
int16_t definition_level = 1;
- value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
- value.len = FIXED_LENGTH;
- ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
+ ba_value.ptr = reinterpret_cast<const uint8_t*>(&hello[0]);
+ ba_value.len = FIXED_LENGTH;
+ ba_writer->WriteBatch(1, &definition_level, nullptr, &ba_value);
} else {
int16_t definition_level = 0;
ba_writer->WriteBatch(1, &definition_level, nullptr, nullptr);
}
- }
+ buffered_values_estimate[col_id] = ba_writer->EstimatedBufferedValueBytes();
- // Write the FixedLengthByteArray column
- parquet::FixedLenByteArrayWriter* flba_writer =
- static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->NextColumn());
- for (int i = 0; i < NUM_ROWS_PER_ROW_GROUP; i++) {
- parquet::FixedLenByteArray value;
+ // Write the FixedLengthByteArray column
+ col_id++;
+ parquet::FixedLenByteArrayWriter* flba_writer =
+ static_cast<parquet::FixedLenByteArrayWriter*>(rg_writer->column(col_id));
+ parquet::FixedLenByteArray flba_value;
char v = static_cast<char>(i);
char flba[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
- value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
+ flba_value.ptr = reinterpret_cast<const uint8_t*>(&flba[0]);
- flba_writer->WriteBatch(1, nullptr, nullptr, &value);
+ flba_writer->WriteBatch(1, nullptr, nullptr, &flba_value);
+ buffered_values_estimate[col_id] = flba_writer->EstimatedBufferedValueBytes();
}
+ // Close the RowGroupWriter
+ rg_writer->Close();
// Close the ParquetFileWriter
file_writer->Close();
@@ -236,34 +205,35 @@ int main(int argc, char** argv) {
// Get the File MetaData
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
- // Get the number of RowGroups
int num_row_groups = file_metadata->num_row_groups();
- assert(num_row_groups == 1);
// Get the number of Columns
int num_columns = file_metadata->num_columns();
assert(num_columns == 8);
+ std::vector<int> col_row_counts(num_columns, 0);
+
// Iterate over all the RowGroups in the file
for (int r = 0; r < num_row_groups; ++r) {
// Get the RowGroup Reader
std::shared_ptr<parquet::RowGroupReader> row_group_reader =
parquet_reader->RowGroup(r);
+ assert(row_group_reader->metadata()->total_byte_size() < ROW_GROUP_SIZE);
+
int64_t values_read = 0;
int64_t rows_read = 0;
int16_t definition_level;
int16_t repetition_level;
- int i;
std::shared_ptr<parquet::ColumnReader> column_reader;
+ int col_id = 0;
// Get the Column Reader for the boolean column
- column_reader = row_group_reader->Column(0);
+ column_reader = row_group_reader->Column(col_id);
parquet::BoolReader* bool_reader =
static_cast<parquet::BoolReader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (bool_reader->HasNext()) {
bool value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -274,17 +244,17 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- bool expected_value = ((i % 2) == 0) ? true : false;
+ bool expected_value = ((col_row_counts[col_id] % 2) == 0) ? true : false;
assert(value == expected_value);
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the Int32 column
- column_reader = row_group_reader->Column(1);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::Int32Reader* int32_reader =
static_cast<parquet::Int32Reader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (int32_reader->HasNext()) {
int32_t value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -295,16 +265,16 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- assert(value == i);
- i++;
+ assert(value == col_row_counts[col_id]);
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the Int64 column
- column_reader = row_group_reader->Column(2);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::Int64Reader* int64_reader =
static_cast<parquet::Int64Reader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (int64_reader->HasNext()) {
int64_t value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -316,23 +286,22 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- int64_t expected_value = i * 1000 * 1000;
- expected_value *= 1000 * 1000;
+ int64_t expected_value = col_row_counts[col_id];
assert(value == expected_value);
- if ((i % 2) == 0) {
- assert(repetition_level == 1);
- } else {
+ if ((col_row_counts[col_id] % 2) == 0) {
assert(repetition_level == 0);
+ } else {
+ assert(repetition_level == 1);
}
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the Int96 column
- column_reader = row_group_reader->Column(3);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::Int96Reader* int96_reader =
static_cast<parquet::Int96Reader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (int96_reader->HasNext()) {
parquet::Int96 value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -344,21 +313,21 @@ int main(int argc, char** argv) {
assert(values_read == 1);
// Verify the value written
parquet::Int96 expected_value;
- expected_value.value[0] = i;
- expected_value.value[1] = i + 1;
- expected_value.value[2] = i + 2;
+ expected_value.value[0] = col_row_counts[col_id];
+ expected_value.value[1] = col_row_counts[col_id] + 1;
+ expected_value.value[2] = col_row_counts[col_id] + 2;
for (int j = 0; j < 3; j++) {
assert(value.value[j] == expected_value.value[j]);
}
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the Float column
- column_reader = row_group_reader->Column(4);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::FloatReader* float_reader =
static_cast<parquet::FloatReader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (float_reader->HasNext()) {
float value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -369,17 +338,17 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- float expected_value = static_cast<float>(i) * 1.1f;
+ float expected_value = static_cast<float>(col_row_counts[col_id]) * 1.1f;
assert(value == expected_value);
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the Double column
- column_reader = row_group_reader->Column(5);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::DoubleReader* double_reader =
static_cast<parquet::DoubleReader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (double_reader->HasNext()) {
double value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -390,17 +359,17 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- double expected_value = i * 1.1111111;
+ double expected_value = col_row_counts[col_id] * 1.1111111;
assert(value == expected_value);
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the ByteArray column
- column_reader = row_group_reader->Column(6);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::ByteArrayReader* ba_reader =
static_cast<parquet::ByteArrayReader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (ba_reader->HasNext()) {
parquet::ByteArray value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -411,10 +380,10 @@ int main(int argc, char** argv) {
assert(rows_read == 1);
// Verify the value written
char expected_value[FIXED_LENGTH] = "parquet";
- expected_value[7] = static_cast<char>('0' + i / 100);
- expected_value[8] = static_cast<char>('0' + (i / 10) % 10);
- expected_value[9] = static_cast<char>('0' + i % 10);
- if (i % 2 == 0) { // only alternate values exist
+ expected_value[7] = static_cast<char>('0' + col_row_counts[col_id] / 100);
+ expected_value[8] = static_cast<char>('0' + (col_row_counts[col_id] / 10) % 10);
+ expected_value[9] = static_cast<char>('0' + col_row_counts[col_id] % 10);
+ if (col_row_counts[col_id] % 2 == 0) { // only alternate values exist
// There are no NULL values in the rows written
assert(values_read == 1);
assert(value.len == FIXED_LENGTH);
@@ -425,15 +394,15 @@ int main(int argc, char** argv) {
assert(values_read == 0);
assert(definition_level == 0);
}
- i++;
+ col_row_counts[col_id]++;
}
// Get the Column Reader for the FixedLengthByteArray column
- column_reader = row_group_reader->Column(7);
+ col_id++;
+ column_reader = row_group_reader->Column(col_id);
parquet::FixedLenByteArrayReader* flba_reader =
static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
// Read all the rows in the column
- i = 0;
while (flba_reader->HasNext()) {
parquet::FixedLenByteArray value;
// Read one value at a time. The number of rows read is returned. values_read
@@ -444,10 +413,10 @@ int main(int argc, char** argv) {
// There are no NULL values in the rows written
assert(values_read == 1);
// Verify the value written
- char v = static_cast<char>(i);
+ char v = static_cast<char>(col_row_counts[col_id]);
char expected_value[FIXED_LENGTH] = {v, v, v, v, v, v, v, v, v, v};
assert(memcmp(value.ptr, &expected_value[0], FIXED_LENGTH) == 0);
- i++;
+ col_row_counts[col_id]++;
}
}
} catch (const std::exception& e) {
diff --git a/cpp/examples/parquet/low-level-api/reader_writer.h b/cpp/examples/parquet/low-level-api/reader_writer.h
new file mode 100644
index 0000000..3fda0cf
--- /dev/null
+++ b/cpp/examples/parquet/low-level-api/reader_writer.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/io/file.h>
+#include <arrow/util/logging.h>
+
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+
+using parquet::LogicalType;
+using parquet::Repetition;
+using parquet::Type;
+using parquet::schema::GroupNode;
+using parquet::schema::PrimitiveNode;
+
+constexpr int FIXED_LENGTH = 10;
+
+static std::shared_ptr<GroupNode> SetupSchema() {
+ parquet::schema::NodeVector fields;
+ // Create a primitive node named 'boolean_field' with type:BOOLEAN,
+ // repetition:REQUIRED
+ fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
+ Type::BOOLEAN, LogicalType::NONE));
+
+ // Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
+ // logical type:TIME_MILLIS
+ fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
+ LogicalType::TIME_MILLIS));
+
+ // Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
+ fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
+ LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
+ LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
+ LogicalType::NONE));
+
+ fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
+ LogicalType::NONE));
+
+ // Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
+ fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
+ LogicalType::NONE));
+
+ // Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
+ // repetition:REQUIRED, field_length = FIXED_LENGTH
+ fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE,
+ FIXED_LENGTH));
+
+ // Create a GroupNode named 'schema' using the primitive nodes defined above
+ // This GroupNode is the root node of the schema tree
+ return std::static_pointer_cast<GroupNode>(
+ GroupNode::Make("schema", Repetition::REQUIRED, fields));
+}
[arrow] 08/24: PARQUET-909: Reduce buffer allocations (mallocs) on
critical path
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 4b444c626929e305e3d3cfe1e105db866084b64e
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Fri Mar 17 18:07:59 2017 -0400
PARQUET-909: Reduce buffer allocations (mallocs) on critical path
Author: Deepak Majeti <de...@hpe.com>
Closes #268 from majetideepak/ReuseBuffers and squashes the following commits:
bbf5453 [Deepak Majeti] Review comments
4d93d4b [Deepak Majeti] Improve example
3a3e2bb [Deepak Majeti] Fix Resize shrink_to_fit
53c8ac1 [Deepak Majeti] Improve API
22f422f [Deepak Majeti] clang format
8ae02cc [Deepak Majeti] optimize for uncompressed data
3190cef [Deepak Majeti] change fit_to_size of InMemoryOutputStream
03d4862 [Deepak Majeti] clang format
261aa1c [Deepak Majeti] Rewrite Compress API
b09c4a8 [Deepak Majeti] Reuse uncompressed_data buffer
6b9a81b [Deepak Majeti] Clang fromat
31af602 [Deepak Majeti] Reuse levels rle buffer
82eabfb [Deepak Majeti] Re-use def and rep levels sink
edc8e3c [Deepak Majeti] Add API to InMemoryOutputStream
Change-Id: I77483a383043d590936b42c06d425fbdc1edf9fc
---
cpp/examples/parquet/reader-writer.cc | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 59ee63b..54390e0 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -110,9 +110,14 @@ int main(int argc, char** argv) {
// Setup the parquet schema
std::shared_ptr<GroupNode> schema = SetupSchema();
+ // Add writer properties
+ parquet::WriterProperties::Builder builder;
+ builder.compression(parquet::Compression::SNAPPY);
+ std::shared_ptr<parquet::WriterProperties> props = builder.build();
+
// Create a ParquetFileWriter instance
std::shared_ptr<parquet::ParquetFileWriter> file_writer =
- parquet::ParquetFileWriter::Open(out_file, schema);
+ parquet::ParquetFileWriter::Open(out_file, schema, props);
// Append a RowGroup with a specific number of rows.
parquet::RowGroupWriter* rg_writer =
@@ -225,6 +230,7 @@ int main(int argc, char** argv) {
// Create a ParquetReader instance
std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
parquet::ParquetFileReader::OpenFile(PARQUET_FILENAME, false);
+
// Get the File MetaData
std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
[arrow] 17/24: PARQUET-1068: Modify .clang-format to use straight
Google format with 90-character line width
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit d02cd9d0e3db2548bd758199c54f0d1a51b32fb4
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Mon Jul 31 11:14:52 2017 -0400
PARQUET-1068: Modify .clang-format to use straight Google format with 90-character line width
The main change is horizontal alignment. We should also do a clang-tidy pass sometime to do some further scrubbing
Author: Wes McKinney <we...@twosigma.com>
Closes #375 from wesm/PARQUET-1068 and squashes the following commits:
b81145d [Wes McKinney] Modify .clang-format to use straight Google format with 90-character line width
Change-Id: If8345d1d2a03d785ed41a5848de2c40e4bf53b5b
---
cpp/examples/parquet/reader-writer.cc | 35 ++++++++++++++++++-----------------
cpp/tools/parquet/parquet-scan.cc | 9 ++++++---
2 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/cpp/examples/parquet/reader-writer.cc b/cpp/examples/parquet/reader-writer.cc
index 210968c..7136b28 100644
--- a/cpp/examples/parquet/reader-writer.cc
+++ b/cpp/examples/parquet/reader-writer.cc
@@ -59,35 +59,36 @@ static std::shared_ptr<GroupNode> SetupSchema() {
parquet::schema::NodeVector fields;
// Create a primitive node named 'boolean_field' with type:BOOLEAN,
// repetition:REQUIRED
- fields.push_back(PrimitiveNode::Make(
- "boolean_field", Repetition::REQUIRED, Type::BOOLEAN, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("boolean_field", Repetition::REQUIRED,
+ Type::BOOLEAN, LogicalType::NONE));
// Create a primitive node named 'int32_field' with type:INT32, repetition:REQUIRED,
// logical type:TIME_MILLIS
- fields.push_back(PrimitiveNode::Make(
- "int32_field", Repetition::REQUIRED, Type::INT32, LogicalType::TIME_MILLIS));
+ fields.push_back(PrimitiveNode::Make("int32_field", Repetition::REQUIRED, Type::INT32,
+ LogicalType::TIME_MILLIS));
// Create a primitive node named 'int64_field' with type:INT64, repetition:REPEATED
- fields.push_back(PrimitiveNode::Make(
- "int64_field", Repetition::REPEATED, Type::INT64, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("int64_field", Repetition::REPEATED, Type::INT64,
+ LogicalType::NONE));
- fields.push_back(PrimitiveNode::Make(
- "int96_field", Repetition::REQUIRED, Type::INT96, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("int96_field", Repetition::REQUIRED, Type::INT96,
+ LogicalType::NONE));
- fields.push_back(PrimitiveNode::Make(
- "float_field", Repetition::REQUIRED, Type::FLOAT, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("float_field", Repetition::REQUIRED, Type::FLOAT,
+ LogicalType::NONE));
- fields.push_back(PrimitiveNode::Make(
- "double_field", Repetition::REQUIRED, Type::DOUBLE, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("double_field", Repetition::REQUIRED, Type::DOUBLE,
+ LogicalType::NONE));
// Create a primitive node named 'ba_field' with type:BYTE_ARRAY, repetition:OPTIONAL
- fields.push_back(PrimitiveNode::Make(
- "ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY, LogicalType::NONE));
+ fields.push_back(PrimitiveNode::Make("ba_field", Repetition::OPTIONAL, Type::BYTE_ARRAY,
+ LogicalType::NONE));
// Create a primitive node named 'flba_field' with type:FIXED_LEN_BYTE_ARRAY,
// repetition:REQUIRED, field_length = FIXED_LENGTH
fields.push_back(PrimitiveNode::Make("flba_field", Repetition::REQUIRED,
- Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, FIXED_LENGTH));
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE,
+ FIXED_LENGTH));
// Create a GroupNode named 'schema' using the primitive nodes defined above
// This GroupNode is the root node of the schema tree
@@ -308,8 +309,8 @@ int main(int argc, char** argv) {
int64_t value;
// Read one value at a time. The number of rows read is returned. values_read
// contains the number of non-null rows
- rows_read = int64_reader->ReadBatch(
- 1, &definition_level, &repetition_level, &value, &values_read);
+ rows_read = int64_reader->ReadBatch(1, &definition_level, &repetition_level,
+ &value, &values_read);
// Ensure only one value is read
assert(rows_read == 1);
// There are no NULL values in the rows written
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index 8ab15a4..5bf2b18 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -49,7 +49,9 @@ int main(int argc, char** argv) {
}
} else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
- if (value) { batch_size = std::atoi(value); }
+ if (value) {
+ batch_size = std::atoi(value);
+ }
} else {
filename = argv[i];
}
@@ -84,8 +86,9 @@ int main(int argc, char** argv) {
int64_t values_read = 0;
while (col_reader->HasNext()) {
- total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
- rep_levels.data(), values.data(), &values_read, col_reader.get());
+ total_rows[col] +=
+ ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
+ values.data(), &values_read, col_reader.get());
}
col++;
}
[arrow] 18/24: PARQUET-1083: Factor logic in parquet-scan.cc into a
library function to help with perf testing
Posted by we...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 144699c4210ac0b75d16395a583671b2874acc9e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Wed Aug 30 09:49:14 2017 +0200
PARQUET-1083: Factor logic in parquet-scan.cc into a library function to help with perf testing
See ARROW-1377
Author: Wes McKinney <we...@twosigma.com>
Closes #385 from wesm/PARQUET-1083 and squashes the following commits:
359cd09 [Wes McKinney] Factor main logic in parquet-scan.cc into a library function, so that library users can use for performance testing
Change-Id: Ia50d136c380c4d42d6c62577e02a9533df6fa6fe
---
cpp/tools/parquet/parquet-scan.cc | 38 ++------------------------------------
1 file changed, 2 insertions(+), 36 deletions(-)
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
index 5bf2b18..fdc73d7 100644
--- a/cpp/tools/parquet/parquet-scan.cc
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -57,50 +57,16 @@ int main(int argc, char** argv) {
}
}
- std::vector<int16_t> rep_levels(batch_size);
- std::vector<int16_t> def_levels(batch_size);
try {
double total_time;
std::clock_t start_time = std::clock();
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename);
- // columns are not specified explicitly. Add all columns
- if (num_columns == 0) {
- num_columns = reader->metadata()->num_columns();
- columns.resize(num_columns);
- for (int i = 0; i < num_columns; i++) {
- columns[i] = i;
- }
- }
-
- std::vector<int64_t> total_rows(num_columns);
-
- for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
- auto group_reader = reader->RowGroup(r);
- int col = 0;
- for (auto i : columns) {
- total_rows[col] = 0;
- std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
- size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
- std::vector<uint8_t> values(batch_size * value_byte_size);
- int64_t values_read = 0;
- while (col_reader->HasNext()) {
- total_rows[col] +=
- ScanAllValues(batch_size, def_levels.data(), rep_levels.data(),
- values.data(), &values_read, col_reader.get());
- }
- col++;
- }
- }
+ int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get());
total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
- for (int ct = 1; ct < num_columns; ++ct) {
- if (total_rows[0] != total_rows[ct]) {
- std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
- }
- }
- std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
+ std::cout << total_rows << " rows scanned in " << total_time << " seconds."
<< std::endl;
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;