You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2017/04/25 06:35:03 UTC
parquet-cpp git commit: PARQUET-958: [C++] Print Parquet metadata in JSON format
Repository: parquet-cpp
Updated Branches:
refs/heads/master a54404ed0 -> 39ebf2afa
PARQUET-958: [C++] Print Parquet metadata in JSON format
Also made minor formatting changes to DebugPrint.
Printing values is not supported in JSON mode; only the metadata is JSON-formatted in this patch.
Author: Deepak Majeti <de...@hpe.com>
Closes #310 from majetideepak/PARQUET-958 and squashes the following commits:
4d9cbbd [Deepak Majeti] change DebugPrint to take filename
3c78bc0 [Deepak Majeti] use raw string
97f016a [Deepak Majeti] add test and clang format
ec12ddb [Deepak Majeti] add JSONPrint
9c697e2 [Deepak Majeti] fix CMake flag for benchmarks
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/39ebf2af
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/39ebf2af
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/39ebf2af
Branch: refs/heads/master
Commit: 39ebf2afafbf498fa5d584143248e7988a4c04dd
Parents: a54404e
Author: Deepak Majeti <de...@hpe.com>
Authored: Tue Apr 25 08:34:58 2017 +0200
Committer: Uwe L. Korn <uw...@apache.org>
Committed: Tue Apr 25 08:34:58 2017 +0200
----------------------------------------------------------------------
benchmarks/CMakeLists.txt | 2 +-
src/parquet/file/printer.cc | 118 +++++++++++++++++++++++++++++++++++----
src/parquet/file/printer.h | 8 ++-
src/parquet/reader-test.cc | 66 ++++++++++++++++++++++
tools/parquet_reader.cc | 11 +++-
5 files changed, 189 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 1df5dea..2ef8113 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -19,7 +19,7 @@ SET(LINK_LIBS
snappystatic
thriftstatic)
-if (PARQUET_BUILD_EXECUTABLES)
+if (PARQUET_BUILD_BENCHMARKS)
add_executable(decode_benchmark decode_benchmark.cc)
# This uses private APIs
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/printer.cc b/src/parquet/file/printer.cc
index 8dd9d55..4d0dad4 100644
--- a/src/parquet/file/printer.cc
+++ b/src/parquet/file/printer.cc
@@ -33,10 +33,11 @@ namespace parquet {
#define COL_WIDTH "30"
void ParquetFilePrinter::DebugPrint(
- std::ostream& stream, std::list<int> selected_columns, bool print_values) {
+ std::ostream& stream, std::list<int> selected_columns, bool print_values,
+ const char* filename) {
const FileMetaData* file_metadata = fileReader->metadata().get();
- stream << "File statistics:\n";
+ stream << "File Name: " << filename << "\n";
stream << "Version: " << file_metadata->version() << "\n";
stream << "Created By: " << file_metadata->created_by() << "\n";
stream << "Total rows: " << file_metadata->num_rows() << "\n";
@@ -71,7 +72,7 @@ void ParquetFilePrinter::DebugPrint(
std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
- stream << " rows: " << group_metadata->num_rows() << "---\n";
+ stream << " Rows: " << group_metadata->num_rows() << "---\n";
// Print column metadata
for (auto i : selected_columns) {
@@ -79,25 +80,25 @@ void ParquetFilePrinter::DebugPrint(
std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << std::endl << ", values: " << column_chunk->num_values();
+ stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values();
if (column_chunk->is_stats_set()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << ", null values: " << stats->null_count()
- << ", distinct values: " << stats->distinct_count() << std::endl
- << " max: " << FormatStatValue(descr->physical_type(), max.c_str())
- << ", min: " << FormatStatValue(descr->physical_type(), min.c_str());
+ stream << ", Null Values: " << stats->null_count()
+ << ", Distinct Values: " << stats->distinct_count() << std::endl
+ << " Max: " << FormatStatValue(descr->physical_type(), max.c_str())
+ << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str());
} else {
stream << " Statistics Not Set";
}
stream << std::endl
- << " compression: " << CompressionToString(column_chunk->compression())
- << ", encodings: ";
+ << " Compression: " << CompressionToString(column_chunk->compression())
+ << ", Encodings: ";
for (auto encoding : column_chunk->encodings()) {
stream << EncodingToString(encoding) << " ";
}
stream << std::endl
- << " uncompressed size: " << column_chunk->total_uncompressed_size()
- << ", compressed size: " << column_chunk->total_compressed_size()
+ << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
+ << ", Compressed Size: " << column_chunk->total_compressed_size()
<< std::endl;
}
@@ -140,4 +141,97 @@ void ParquetFilePrinter::DebugPrint(
}
}
+// Returns a copy of `raw` with the characters that may not appear verbatim
+// inside a JSON string literal (double quote, backslash and control characters
+// below 0x20) replaced by their escape sequences. All other bytes are copied
+// unchanged.
+static std::string EscapeJSONString(const std::string& raw) {
+  static const char kHexDigits[] = "0123456789abcdef";
+  std::string escaped;
+  escaped.reserve(raw.size());
+  for (const char ch : raw) {
+    switch (ch) {
+      case '"':
+        escaped += "\\\"";
+        break;
+      case '\\':
+        escaped += "\\\\";
+        break;
+      case '\b':
+        escaped += "\\b";
+        break;
+      case '\f':
+        escaped += "\\f";
+        break;
+      case '\n':
+        escaped += "\\n";
+        break;
+      case '\r':
+        escaped += "\\r";
+        break;
+      case '\t':
+        escaped += "\\t";
+        break;
+      default: {
+        const unsigned char byte = static_cast<unsigned char>(ch);
+        if (byte < 0x20) {
+          // The remaining control characters have no short form: emit \u00XX.
+          escaped += "\\u00";
+          escaped += kHexDigits[byte >> 4];
+          escaped += kHexDigits[byte & 0x0F];
+        } else {
+          escaped += ch;
+        }
+        break;
+      }
+    }
+  }
+  return escaped;
+}
+
+// Writes the file metadata to `stream` as a single JSON object: file-level
+// fields, a "Columns" array describing each selected column, and a "RowGroups"
+// array holding one "ColumnChunks" array per row group. Every value is emitted
+// as a JSON string. Column values themselves are not printed.
+//
+// stream            Destination of the JSON text.
+// selected_columns  Column indices to describe; an empty list selects every
+//                   column of the file.
+// filename          Name reported in the "FileName" field; a null pointer is
+//                   reported as an empty name.
+//
+// Throws ParquetException if a selected index is negative or not smaller than
+// the number of columns in the file.
+void ParquetFilePrinter::JSONPrint(
+    std::ostream& stream, std::list<int> selected_columns, const char* filename) {
+  const FileMetaData* file_metadata = fileReader->metadata().get();
+
+  // Free-form text is escaped so that quotes, backslashes (e.g. Windows paths)
+  // or control characters cannot break the surrounding JSON string.
+  const std::string safe_filename =
+      EscapeJSONString(filename != nullptr ? filename : "");
+
+  stream << "{\n";
+  stream << " \"FileName\": \"" << safe_filename << "\",\n";
+  stream << " \"Version\": \"" << file_metadata->version() << "\",\n";
+  stream << " \"CreatedBy\": \"" << EscapeJSONString(file_metadata->created_by())
+         << "\",\n";
+  stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
+  stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
+  stream << " \"NumberOfRealColumns\": \""
+         << file_metadata->schema()->group_node()->field_count() << "\",\n";
+  stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
+
+  if (selected_columns.empty()) {
+    // No explicit selection: describe every column.
+    for (int i = 0; i < file_metadata->num_columns(); i++) {
+      selected_columns.push_back(i);
+    }
+  } else {
+    for (auto i : selected_columns) {
+      if (i < 0 || i >= file_metadata->num_columns()) {
+        throw ParquetException("Selected column is out of range");
+      }
+    }
+  }
+
+  stream << " \"Columns\": [\n";
+  bool first_column = true;
+  for (auto i : selected_columns) {
+    // Separator goes before every element but the first, so there is never a
+    // trailing comma.
+    if (!first_column) { stream << ",\n"; }
+    first_column = false;
+
+    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+    stream << " { \"Id\": \"" << i << "\", \"Name\": \""
+           << EscapeJSONString(descr->name()) << "\","
+           << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
+           << " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type())
+           << "\" }";
+  }
+
+  stream << "\n ],\n \"RowGroups\": [\n";
+  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+    if (r > 0) { stream << ",\n"; }
+    stream << " {\n \"Id\": \"" << r << "\", ";
+
+    // NOTE(review): group_reader is not referenced again in this function; it is
+    // kept because the effects of RowGroup() are not visible from here.
+    auto group_reader = fileReader->RowGroup(r);
+    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
+    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
+
+    // Print column metadata
+    stream << " \"ColumnChunks\": [\n";
+    bool first_chunk = true;
+    for (auto i : selected_columns) {
+      if (!first_chunk) { stream << ",\n"; }
+      first_chunk = false;
+
+      auto column_chunk = group_metadata->ColumnChunk(i);
+      std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
+
+      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+      stream << " {\"Id\": \"" << i << "\", \"Values\": \""
+             << column_chunk->num_values() << "\", "
+             << "\"StatsSet\": ";
+      if (column_chunk->is_stats_set()) {
+        stream << "\"True\", \"Stats\": {";
+        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+        // NOTE(review): the formatted min/max are written unescaped; confirm
+        // FormatStatValue cannot yield quotes or control characters.
+        stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
+               << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
+               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str())
+               << "\", "
+               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str())
+               << "\" },";
+      } else {
+        stream << "\"False\",";
+      }
+      stream << "\n \"Compression\": \""
+             << CompressionToString(column_chunk->compression())
+             << "\", \"Encodings\": \"";
+      for (auto encoding : column_chunk->encodings()) {
+        stream << EncodingToString(encoding) << " ";
+      }
+      stream << "\", "
+             << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
+             << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+
+      // end of a ColumnChunk
+      stream << "\" }";
+    }
+
+    stream << "\n ]\n }";
+  }
+  stream << "\n ]\n}\n";
+}
+
+
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/file/printer.h
----------------------------------------------------------------------
diff --git a/src/parquet/file/printer.h b/src/parquet/file/printer.h
index 433f9e8..bd54e40 100644
--- a/src/parquet/file/printer.h
+++ b/src/parquet/file/printer.h
@@ -32,12 +32,18 @@ namespace parquet {
+// Prints the contents of an already opened Parquet file to an output stream,
+// either as human-readable text (DebugPrint) or as JSON (JSONPrint).
+// The printer only stores the reader pointer it is given; its destructor does
+// not release the reader.
class PARQUET_EXPORT ParquetFilePrinter {
private:
ParquetFileReader* fileReader;
+
public:
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
~ParquetFilePrinter() {}
+ // Writes a textual description of the file to `stream`, starting with
+ // `filename`, the format version, the creator and the total row count.
+ // `selected_columns` holds the indices of the columns to describe.
+ // NOTE(review): `print_values` presumably also dumps the column values when
+ // true -- confirm against the definition in printer.cc.
void DebugPrint(
- std::ostream& stream, std::list<int> selected_columns, bool print_values = true);
+ std::ostream& stream, std::list<int> selected_columns, bool print_values = true,
+ const char* filename = "No Name");
+
+ // Writes the file metadata to `stream` as a JSON object; values are not
+ // printed. An empty `selected_columns` list selects every column, and an
+ // out-of-range index raises ParquetException. `filename` is reported in the
+ // "FileName" field.
+ void JSONPrint(
+ std::ostream& stream, std::list<int> selected_columns,
+ const char* filename = "No Name");
};
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/src/parquet/reader-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/reader-test.cc b/src/parquet/reader-test.cc
index f7c666c..71f982b 100644
--- a/src/parquet/reader-test.cc
+++ b/src/parquet/reader-test.cc
@@ -256,4 +256,70 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
ASSERT_EQ(ss2.str(), ss.str());
}
+// Checks that JSONPrint, given an empty column selection, produces exactly the
+// expected metadata document for the alltypes_plain sample file.
+TEST(TestJSONWithLocalFile, JSONOutput) {
+  const std::string expected_json = R"###({
+ "FileName": "alltypes_plain.parquet",
+ "Version": "0",
+ "CreatedBy": "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)",
+ "TotalRows": "8",
+ "NumberOfRowGroups": "1",
+ "NumberOfRealColumns": "11",
+ "NumberOfColumns": "11",
+ "Columns": [
+ { "Id": "0", "Name": "id", "PhysicalType": "INT32", "LogicalType": "NONE" },
+ { "Id": "1", "Name": "bool_col", "PhysicalType": "BOOLEAN", "LogicalType": "NONE" },
+ { "Id": "2", "Name": "tinyint_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+ { "Id": "3", "Name": "smallint_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+ { "Id": "4", "Name": "int_col", "PhysicalType": "INT32", "LogicalType": "NONE" },
+ { "Id": "5", "Name": "bigint_col", "PhysicalType": "INT64", "LogicalType": "NONE" },
+ { "Id": "6", "Name": "float_col", "PhysicalType": "FLOAT", "LogicalType": "NONE" },
+ { "Id": "7", "Name": "double_col", "PhysicalType": "DOUBLE", "LogicalType": "NONE" },
+ { "Id": "8", "Name": "date_string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" },
+ { "Id": "9", "Name": "string_col", "PhysicalType": "BYTE_ARRAY", "LogicalType": "NONE" },
+ { "Id": "10", "Name": "timestamp_col", "PhysicalType": "INT96", "LogicalType": "NONE" }
+ ],
+ "RowGroups": [
+ {
+ "Id": "0", "TotalBytes": "671", "Rows": "8",
+ "ColumnChunks": [
+ {"Id": "0", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "73", "CompressedSize": "73" },
+ {"Id": "1", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "24", "CompressedSize": "24" },
+ {"Id": "2", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+ {"Id": "3", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+ {"Id": "4", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+ {"Id": "5", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" },
+ {"Id": "6", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "47", "CompressedSize": "47" },
+ {"Id": "7", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "55", "CompressedSize": "55" },
+ {"Id": "8", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "88", "CompressedSize": "88" },
+ {"Id": "9", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "49", "CompressedSize": "49" },
+ {"Id": "10", "Values": "8", "StatsSet": "False",
+ "Compression": "UNCOMPRESSED", "Encodings": "RLE PLAIN_DICTIONARY PLAIN ", "UncompressedSize": "139", "CompressedSize": "139" }
+ ]
+ }
+ ]
+}
+)###";
+
+  // Open the sample file with the second (memory-map) argument disabled and
+  // the default reader properties.
+  auto file_reader =
+      ParquetFileReader::OpenFile(alltypes_plain(), false, default_reader_properties());
+
+  // An empty selection asks the printer to describe every column.
+  const std::list<int> every_column{};
+  std::stringstream actual_json;
+  ParquetFilePrinter json_printer(file_reader.get());
+  json_printer.JSONPrint(actual_json, every_column, "alltypes_plain.parquet");
+
+  ASSERT_EQ(expected_json, actual_json.str());
+}
+
+
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/39ebf2af/tools/parquet_reader.cc
----------------------------------------------------------------------
diff --git a/tools/parquet_reader.cc b/tools/parquet_reader.cc
index 25f81c1..7ef59dc 100644
--- a/tools/parquet_reader.cc
+++ b/tools/parquet_reader.cc
@@ -23,7 +23,7 @@
int main(int argc, char** argv) {
if (argc > 5 || argc < 2) {
- std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+ std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] [--json]"
"[--columns=...] <file>"
<< std::endl;
return -1;
@@ -32,6 +32,7 @@ int main(int argc, char** argv) {
std::string filename;
bool print_values = true;
bool memory_map = true;
+ bool format_json = false;
// Read command-line options
const std::string COLUMNS_PREFIX = "--columns=";
@@ -43,6 +44,8 @@ int main(int argc, char** argv) {
print_values = false;
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
memory_map = false;
+ } else if ((param = std::strstr(argv[i], "--json"))) {
+ format_json = true;
} else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
while (value) {
@@ -58,7 +61,11 @@ int main(int argc, char** argv) {
std::unique_ptr<parquet::ParquetFileReader> reader =
parquet::ParquetFileReader::OpenFile(filename, memory_map);
parquet::ParquetFilePrinter printer(reader.get());
- printer.DebugPrint(std::cout, columns, print_values);
+ if (format_json) {
+ printer.JSONPrint(std::cout, columns, filename.c_str());
+ } else {
+ printer.DebugPrint(std::cout, columns, print_values, filename.c_str());
+ }
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;
return -1;