You are viewing a plain text version of this content; the canonical (HTML) version is available in the mailing list archive.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/05/26 12:21:50 UTC
[arrow] branch master updated: PARQUET-1586: [C++] Add --dump
options to parquet-reader tool to dump def/rep levels
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d82ac40 PARQUET-1586: [C++] Add --dump options to parquet-reader tool to dump def/rep levels
d82ac40 is described below
commit d82ac407fab1d4b28669b8f7a940f88d39dfd874
Author: Renat Valiullin <ri...@gmail.com>
AuthorDate: Sun May 26 14:21:27 2019 +0200
PARQUET-1586: [C++] Add --dump options to parquet-reader tool to dump def/rep levels
Author: Renat Valiullin <ri...@gmail.com>
Closes #4385 from rip-nsk/PARQUET-1586 and squashes the following commits:
2d5c9e38f <Renat Valiullin> fixes for lint
1e370238b <Renat Valiullin> fix lint
a7bcacee3 <Renat Valiullin> parquet-reader --dump
---
cpp/src/parquet/column_scanner.h | 17 ++++--
cpp/src/parquet/printer.cc | 44 +++++++++-------
cpp/src/parquet/printer.h | 3 +-
cpp/src/parquet/reader-test.cc | 101 +++++++++++++++++++++++++++++++++---
cpp/tools/parquet/parquet-reader.cc | 9 ++--
5 files changed, 140 insertions(+), 34 deletions(-)
diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h
index cb0da2c..1e084a9 100644
--- a/cpp/src/parquet/column_scanner.h
+++ b/cpp/src/parquet/column_scanner.h
@@ -62,7 +62,7 @@ class PARQUET_EXPORT Scanner {
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
- virtual void PrintNext(std::ostream& out, int width) = 0;
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
@@ -171,15 +171,24 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
return true;
}
- virtual void PrintNext(std::ostream& out, int width) {
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
T val;
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
bool is_null = false;
- char buffer[25];
+ char buffer[80];
- if (!NextValue(&val, &is_null)) {
+ if (!Next(&val, &def_level, &rep_level, &is_null)) {
throw ParquetException("No more values buffered");
}
+ if (with_levels) {
+ out << " D:" << def_level << " R:" << rep_level << " ";
+ if (!is_null) {
+ out << "V:";
+ }
+ }
+
if (is_null) {
std::string null_fmt = format_fwf<ByteArrayType>(width);
snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 82c52ff..d78d9b9 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -42,11 +42,11 @@ class ColumnReader;
// ParquetFilePrinter::DebugPrint
// the fixed initial size is just for an example
-#define COL_WIDTH "30"
+#define COL_WIDTH 30
void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values, bool print_key_value_metadata,
- const char* filename) {
+ bool print_values, bool format_dump,
+ bool print_key_value_metadata, const char* filename) {
const FileMetaData* file_metadata = fileReader->metadata().get();
stream << "File Name: " << filename << "\n";
@@ -54,7 +54,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
stream << "Created By: " << file_metadata->created_by() << "\n";
stream << "Total rows: " << file_metadata->num_rows() << "\n";
- if (print_key_value_metadata) {
+ if (print_key_value_metadata && file_metadata->key_value_metadata()) {
auto key_value_metadata = file_metadata->key_value_metadata();
int64_t size_of_key_value_metadata = key_value_metadata->size();
stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
@@ -95,7 +95,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
- stream << " Rows: " << group_metadata->num_rows() << "---\n";
+ stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
// Print column metadata
for (auto i : selected_columns) {
@@ -103,7 +103,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
std::shared_ptr<Statistics> stats = column_chunk->statistics();
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values();
+ stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
if (column_chunk->is_stats_set()) {
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << ", Null Values: " << stats->null_count()
@@ -115,9 +115,9 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
}
stream << std::endl
<< " Compression: " << CompressionToString(column_chunk->compression())
- << ", Encodings: ";
+ << ", Encodings:";
for (auto encoding : column_chunk->encodings()) {
- stream << EncodingToString(encoding) << " ";
+ stream << " " << EncodingToString(encoding);
}
stream << std::endl
<< " Uncompressed Size: " << column_chunk->total_uncompressed_size()
@@ -128,8 +128,9 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
if (!print_values) {
continue;
}
+ stream << "--- Values ---\n";
- static constexpr int bufsize = 25;
+ static constexpr int bufsize = COL_WIDTH + 1;
char buffer[bufsize];
// Create readers for selected columns and print contents
@@ -137,18 +138,25 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
int j = 0;
for (auto i : selected_columns) {
std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ // This is OK in this method as long as the RowGroupReader does not get
+ // deleted
+ auto& scanner = scanners[j++] = Scanner::Make(col_reader);
- std::stringstream ss;
- ss << "%-" << COL_WIDTH << "s";
- std::string fmt = ss.str();
+ if (format_dump) {
+ stream << "Column " << i << std::endl;
+ while (scanner->HasNext()) {
+ scanner->PrintNext(stream, 0, true);
+ stream << "\n";
+ }
+ continue;
+ }
- snprintf(buffer, bufsize, fmt.c_str(),
+ snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
file_metadata->schema()->Column(i)->name().c_str());
stream << buffer;
-
- // This is OK in this method as long as the RowGroupReader does not get
- // deleted
- scanners[j++] = Scanner::Make(col_reader);
+ }
+ if (format_dump) {
+ continue;
}
stream << "\n";
@@ -158,7 +166,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
for (auto scanner : scanners) {
if (scanner->HasNext()) {
hasRow = true;
- scanner->PrintNext(stream, 27);
+ scanner->PrintNext(stream, COL_WIDTH);
}
}
stream << "\n";
diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h
index 4591e7a..9071270 100644
--- a/cpp/src/parquet/printer.h
+++ b/cpp/src/parquet/printer.h
@@ -36,7 +36,8 @@ class PARQUET_EXPORT ParquetFilePrinter {
~ParquetFilePrinter() {}
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values = true, bool print_key_value_metadata = false,
+ bool print_values = false, bool format_dump = false,
+ bool print_key_value_metadata = false,
const char* filename = "No Name");
void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc
index 9c721c0..aad89b2 100644
--- a/cpp/src/parquet/reader-test.cc
+++ b/cpp/src/parquet/reader-test.cc
@@ -37,20 +37,17 @@ namespace parquet {
using ReadableFile = ::arrow::io::ReadableFile;
-std::string alltypes_plain() {
+std::string data_file(const char* file) {
std::string dir_string(test::get_data_dir());
std::stringstream ss;
- ss << dir_string << "/"
- << "alltypes_plain.parquet";
+ ss << dir_string << "/" << file;
return ss.str();
}
+std::string alltypes_plain() { return data_file("alltypes_plain.parquet"); }
+
std::string nation_dict_truncated_data_page() {
- std::string dir_string(test::get_data_dir());
- std::stringstream ss;
- ss << dir_string << "/"
- << "nation.dict-malformed.parquet";
- return ss.str();
+ return data_file("nation.dict-malformed.parquet");
}
class TestAllTypesPlain : public ::testing::Test {
@@ -255,6 +252,94 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
ASSERT_EQ(ss2.str(), ss.str());
}
+TEST(TestDumpWithLocalFile, DumpOutput) {
+ std::string headerOutput = R"###(File Name: nested_lists.snappy.parquet
+Version: 1.0
+Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
+Total rows: 3
+Number of RowGroups: 1
+Number of Real Columns: 2
+Number of Columns: 2
+Number of Selected Columns: 2
+Column 0: element (BYTE_ARRAY)
+Column 1: b (INT32)
+--- Row Group 0 ---
+--- Total Bytes 155 ---
+--- Rows: 3 ---
+Column 0
+ Values: 18 Statistics Not Set
+ Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
+ Uncompressed Size: 103, Compressed Size: 104
+Column 1
+ Values: 3, Null Values: 0, Distinct Values: 0
+ Max: 1, Min: 1
+ Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
+ Uncompressed Size: 52, Compressed Size: 56
+)###";
+ std::string valuesOutput = R"###(--- Values ---
+element b
+a 1
+b 1
+c 1
+NULL
+d
+a
+b
+c
+d
+NULL
+e
+a
+b
+c
+d
+e
+NULL
+f
+
+)###";
+ std::string dumpOutput = R"###(--- Values ---
+Column 0
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:4 R:1 NULL
+ D:7 R:2 V:d
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:7 R:3 V:d
+ D:4 R:1 NULL
+ D:7 R:2 V:e
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:7 R:3 V:d
+ D:7 R:2 V:e
+ D:4 R:1 NULL
+ D:7 R:2 V:f
+Column 1
+ D:0 R:0 V:1
+ D:0 R:0 V:1
+ D:0 R:0 V:1
+)###";
+
+ std::stringstream ssValues, ssDump;
+ // empty list means print all
+ std::list<int> columns;
+
+ const char* file = "nested_lists.snappy.parquet";
+ auto reader_props = default_reader_properties();
+ auto reader = ParquetFileReader::OpenFile(data_file(file), false, reader_props);
+ ParquetFilePrinter printer(reader.get());
+
+ printer.DebugPrint(ssValues, columns, true, false, false, file);
+ ASSERT_EQ(headerOutput + valuesOutput, ssValues.str());
+
+ printer.DebugPrint(ssDump, columns, true, true, false, file);
+ ASSERT_EQ(headerOutput + dumpOutput, ssDump.str());
+}
+
TEST(TestJSONWithLocalFile, JSONOutput) {
std::string jsonOutput = R"###({
"FileName": "alltypes_plain.parquet",
diff --git a/cpp/tools/parquet/parquet-reader.cc b/cpp/tools/parquet/parquet-reader.cc
index a5b7db1..c9194ab 100644
--- a/cpp/tools/parquet/parquet-reader.cc
+++ b/cpp/tools/parquet/parquet-reader.cc
@@ -23,8 +23,8 @@
int main(int argc, char** argv) {
if (argc > 5 || argc < 2) {
- std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json]"
- "[--print-key-value-metadata] [--columns=...] <file>"
+ std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json] "
+ << "[--dump] [--print-key-value-metadata] [--columns=...] <file>"
<< std::endl;
return -1;
}
@@ -34,6 +34,7 @@ int main(int argc, char** argv) {
bool print_key_value_metadata = false;
bool memory_map = true;
bool format_json = false;
+ bool format_dump = false;
// Read command-line options
const std::string COLUMNS_PREFIX = "--columns=";
@@ -49,6 +50,8 @@ int main(int argc, char** argv) {
memory_map = false;
} else if ((param = std::strstr(argv[i], "--json"))) {
format_json = true;
+ } else if ((param = std::strstr(argv[i], "--dump"))) {
+ format_dump = true;
} else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
while (value) {
@@ -68,7 +71,7 @@ int main(int argc, char** argv) {
printer.JSONPrint(std::cout, columns, filename.c_str());
} else {
printer.DebugPrint(std::cout, columns, print_values,
- print_key_value_metadata, filename.c_str());
+ format_dump, print_key_value_metadata, filename.c_str());
}
} catch (const std::exception& e) {
std::cerr << "Parquet error: " << e.what() << std::endl;