You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/05/31 18:52:24 UTC
[arrow] branch master updated: ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0054ef3 ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case
0054ef3 is described below
commit 0054ef3bbd424f0a87527090d15e2ff953ee7d83
Author: Renat Valiullin <ri...@gmail.com>
AuthorDate: Fri May 31 13:52:15 2019 -0500
ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case
Author: Renat Valiullin <ri...@gmail.com>
Closes #4403 from rip-nsk/ARROW-5433 and squashes the following commits:
2daadd94c <Renat Valiullin> replace column name by column path and better type information
---
cpp/src/parquet/printer.cc | 18 +++--
cpp/src/parquet/reader-test.cc | 161 ++++++++++++++++++++---------------------
2 files changed, 92 insertions(+), 87 deletions(-)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 2e44d90..6e49753 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -84,17 +84,24 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
for (auto i : selected_columns) {
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << ": " << descr->name() << " ("
- << TypeToString(descr->physical_type()) << ")" << std::endl;
+ stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
+ << TypeToString(descr->physical_type());
+ if (descr->logical_type() != LogicalType::NONE) {
+ stream << "/" << LogicalTypeToString(descr->logical_type());
+ }
+ if (descr->logical_type() == LogicalType::DECIMAL) {
+ stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ }
+ stream << ")" << std::endl;
}
for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
- stream << "--- Row Group " << r << " ---\n";
+ stream << "--- Row Group: " << r << " ---\n";
auto group_reader = fileReader->RowGroup(r);
std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
- stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
+ stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
// Print column metadata
@@ -153,7 +160,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
file_metadata->schema()->Column(i)->name().c_str());
- stream << buffer;
+ stream << buffer << '|';
}
if (format_dump) {
continue;
@@ -167,6 +174,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
if (scanner->HasNext()) {
hasRow = true;
scanner->PrintNext(stream, COL_WIDTH);
+ stream << '|';
}
}
stream << "\n";
diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc
index 80316f0..e68052e 100644
--- a/cpp/src/parquet/reader-test.cc
+++ b/cpp/src/parquet/reader-test.cc
@@ -230,98 +230,95 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
}
TEST(TestDumpWithLocalFile, DumpOutput) {
- std::stringstream ssValues, ssDump;
+ std::string header_output = R"###(File Name: nested_lists.snappy.parquet
+Version: 1.0
+Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
+Total rows: 3
+Number of RowGroups: 1
+Number of Real Columns: 2
+Number of Columns: 2
+Number of Selected Columns: 2
+Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8)
+Column 1: b (INT32)
+--- Row Group: 0 ---
+--- Total Bytes: 155 ---
+--- Rows: 3 ---
+Column 0
+ Values: 18 Statistics Not Set
+ Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
+ Uncompressed Size: 103, Compressed Size: 104
+Column 1
+ Values: 3, Null Values: 0, Distinct Values: 0
+ Max: 1, Min: 1
+ Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
+ Uncompressed Size: 52, Compressed Size: 56
+)###";
+ std::string values_output = R"###(--- Values ---
+element |b |
+a |1 |
+b |1 |
+c |1 |
+NULL |
+d |
+a |
+b |
+c |
+d |
+NULL |
+e |
+a |
+b |
+c |
+d |
+e |
+NULL |
+f |
+
+)###";
+ std::string dump_output = R"###(--- Values ---
+Column 0
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:4 R:1 NULL
+ D:7 R:2 V:d
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:7 R:3 V:d
+ D:4 R:1 NULL
+ D:7 R:2 V:e
+ D:7 R:0 V:a
+ D:7 R:3 V:b
+ D:7 R:2 V:c
+ D:7 R:3 V:d
+ D:7 R:2 V:e
+ D:4 R:1 NULL
+ D:7 R:2 V:f
+Column 1
+ D:0 R:0 V:1
+ D:0 R:0 V:1
+ D:0 R:0 V:1
+)###";
+
// empty list means print all
std::list<int> columns;
+ std::stringstream ss_values, ss_dump;
const char* file = "nested_lists.snappy.parquet";
auto reader_props = default_reader_properties();
auto reader = ParquetFileReader::OpenFile(data_file(file), false, reader_props);
ParquetFilePrinter printer(reader.get());
- printer.DebugPrint(ssValues, columns, true, false, false, file);
- printer.DebugPrint(ssDump, columns, true, true, false, file);
-
- // TODO(wesm): How to check this output without having a bunch of
- // trailing whitespace lines?
-
- // std::string headerOutput = R"###(File Name: nested_lists.snappy.parquet
- // Version: 1.0
- // Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
- // Total rows: 3
- // Number of RowGroups: 1
- // Number of Real Columns: 2
- // Number of Columns: 2
- // Number of Selected Columns: 2
- // Column 0: element (BYTE_ARRAY)
- // Column 1: b (INT32)
- // --- Row Group 0 ---
- // --- Total Bytes 155 ---
- // --- Rows: 3 ---
- // Column 0
- // Values: 18 Statistics Not Set
- // Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
- // Uncompressed Size: 103, Compressed Size: 104
- // Column 1
- // Values: 3, Null Values: 0, Distinct Values: 0
- // Max: 1, Min: 1
- // Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
- // Uncompressed Size: 52, Compressed Size: 56
- // )###";
- // std::string valuesOutput = R"###(--- Values ---
- // element b
- // a 1
- // b 1
- // c 1
- // NULL
- // d
- // a
- // b
- // c
- // d
- // NULL
- // e
- // a
- // b
- // c
- // d
- // e
- // NULL
- // f
-
- // )###";
- // std::string dumpOutput = R"###(--- Values ---
- // Column 0
- // D:7 R:0 V:a
- // D:7 R:3 V:b
- // D:7 R:2 V:c
- // D:4 R:1 NULL
- // D:7 R:2 V:d
- // D:7 R:0 V:a
- // D:7 R:3 V:b
- // D:7 R:2 V:c
- // D:7 R:3 V:d
- // D:4 R:1 NULL
- // D:7 R:2 V:e
- // D:7 R:0 V:a
- // D:7 R:3 V:b
- // D:7 R:2 V:c
- // D:7 R:3 V:d
- // D:7 R:2 V:e
- // D:4 R:1 NULL
- // D:7 R:2 V:f
- // Column 1
- // D:0 R:0 V:1
- // D:0 R:0 V:1
- // D:0 R:0 V:1
- // )###";
-
- // ASSERT_EQ(headerOutput + valuesOutput, ssValues.str());
- // ASSERT_EQ(headerOutput + dumpOutput, ssDump.str());
+ printer.DebugPrint(ss_values, columns, true, false, false, file);
+ printer.DebugPrint(ss_dump, columns, true, true, false, file);
+
+ ASSERT_EQ(header_output + values_output, ss_values.str());
+ ASSERT_EQ(header_output + dump_output, ss_dump.str());
}
TEST(TestJSONWithLocalFile, JSONOutput) {
- std::string jsonOutput = R"###({
+ std::string json_output = R"###({
"FileName": "alltypes_plain.parquet",
"Version": "0",
"CreatedBy": "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)",
@@ -383,7 +380,7 @@ TEST(TestJSONWithLocalFile, JSONOutput) {
ParquetFilePrinter printer(reader.get());
printer.JSONPrint(ss, columns, "alltypes_plain.parquet");
- ASSERT_EQ(jsonOutput, ss.str());
+ ASSERT_EQ(json_output, ss.str());
}
} // namespace parquet