You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/05/31 18:52:24 UTC

[arrow] branch master updated: ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0054ef3  ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case
0054ef3 is described below

commit 0054ef3bbd424f0a87527090d15e2ff953ee7d83
Author: Renat Valiullin <ri...@gmail.com>
AuthorDate: Fri May 31 13:52:15 2019 -0500

    ARROW-5433: [C++][Parquet] Improve parquet-reader columns information, strip trailing whitespace from test case
    
    Author: Renat Valiullin <ri...@gmail.com>
    
    Closes #4403 from rip-nsk/ARROW-5433 and squashes the following commits:
    
    2daadd94c <Renat Valiullin> replace column name by column path and better type information
---
 cpp/src/parquet/printer.cc     |  18 +++--
 cpp/src/parquet/reader-test.cc | 161 ++++++++++++++++++++---------------------
 2 files changed, 92 insertions(+), 87 deletions(-)

diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 2e44d90..6e49753 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -84,17 +84,24 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
   stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
   for (auto i : selected_columns) {
     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
-    stream << "Column " << i << ": " << descr->name() << " ("
-           << TypeToString(descr->physical_type()) << ")" << std::endl;
+    stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
+           << TypeToString(descr->physical_type());
+    if (descr->logical_type() != LogicalType::NONE) {
+      stream << "/" << LogicalTypeToString(descr->logical_type());
+    }
+    if (descr->logical_type() == LogicalType::DECIMAL) {
+      stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+    }
+    stream << ")" << std::endl;
   }
 
   for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
-    stream << "--- Row Group " << r << " ---\n";
+    stream << "--- Row Group: " << r << " ---\n";
 
     auto group_reader = fileReader->RowGroup(r);
     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
 
-    stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
+    stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
     stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
 
     // Print column metadata
@@ -153,7 +160,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
 
       snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
                file_metadata->schema()->Column(i)->name().c_str());
-      stream << buffer;
+      stream << buffer << '|';
     }
     if (format_dump) {
       continue;
@@ -167,6 +174,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
         if (scanner->HasNext()) {
           hasRow = true;
           scanner->PrintNext(stream, COL_WIDTH);
+          stream << '|';
         }
       }
       stream << "\n";
diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc
index 80316f0..e68052e 100644
--- a/cpp/src/parquet/reader-test.cc
+++ b/cpp/src/parquet/reader-test.cc
@@ -230,98 +230,95 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
 }
 
 TEST(TestDumpWithLocalFile, DumpOutput) {
-  std::stringstream ssValues, ssDump;
+  std::string header_output = R"###(File Name: nested_lists.snappy.parquet
+Version: 1.0
+Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
+Total rows: 3
+Number of RowGroups: 1
+Number of Real Columns: 2
+Number of Columns: 2
+Number of Selected Columns: 2
+Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8)
+Column 1: b (INT32)
+--- Row Group: 0 ---
+--- Total Bytes: 155 ---
+--- Rows: 3 ---
+Column 0
+  Values: 18  Statistics Not Set
+  Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
+  Uncompressed Size: 103, Compressed Size: 104
+Column 1
+  Values: 3, Null Values: 0, Distinct Values: 0
+  Max: 1, Min: 1
+  Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
+  Uncompressed Size: 52, Compressed Size: 56
+)###";
+  std::string values_output = R"###(--- Values ---
+element                       |b                             |
+a                             |1                             |
+b                             |1                             |
+c                             |1                             |
+NULL                          |
+d                             |
+a                             |
+b                             |
+c                             |
+d                             |
+NULL                          |
+e                             |
+a                             |
+b                             |
+c                             |
+d                             |
+e                             |
+NULL                          |
+f                             |
+
+)###";
+  std::string dump_output = R"###(--- Values ---
+Column 0
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:4 R:1 NULL
+  D:7 R:2 V:d
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:7 R:3 V:d
+  D:4 R:1 NULL
+  D:7 R:2 V:e
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:7 R:3 V:d
+  D:7 R:2 V:e
+  D:4 R:1 NULL
+  D:7 R:2 V:f
+Column 1
+  D:0 R:0 V:1
+  D:0 R:0 V:1
+  D:0 R:0 V:1
+)###";
+
   // empty list means print all
   std::list<int> columns;
 
+  std::stringstream ss_values, ss_dump;
   const char* file = "nested_lists.snappy.parquet";
   auto reader_props = default_reader_properties();
   auto reader = ParquetFileReader::OpenFile(data_file(file), false, reader_props);
   ParquetFilePrinter printer(reader.get());
 
-  printer.DebugPrint(ssValues, columns, true, false, false, file);
-  printer.DebugPrint(ssDump, columns, true, true, false, file);
-
-  // TODO(wesm): How to check this output without having a bunch of
-  // trailing whitespace lines?
-
-  //   std::string headerOutput = R"###(File Name: nested_lists.snappy.parquet
-  // Version: 1.0
-  // Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
-  // Total rows: 3
-  // Number of RowGroups: 1
-  // Number of Real Columns: 2
-  // Number of Columns: 2
-  // Number of Selected Columns: 2
-  // Column 0: element (BYTE_ARRAY)
-  // Column 1: b (INT32)
-  // --- Row Group 0 ---
-  // --- Total Bytes 155 ---
-  // --- Rows: 3 ---
-  // Column 0
-  //   Values: 18  Statistics Not Set
-  //   Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
-  //   Uncompressed Size: 103, Compressed Size: 104
-  // Column 1
-  //   Values: 3, Null Values: 0, Distinct Values: 0
-  //   Max: 1, Min: 1
-  //   Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
-  //   Uncompressed Size: 52, Compressed Size: 56
-  // )###";
-  //   std::string valuesOutput = R"###(--- Values ---
-  // element                       b
-  // a                             1
-  // b                             1
-  // c                             1
-  // NULL
-  // d
-  // a
-  // b
-  // c
-  // d
-  // NULL
-  // e
-  // a
-  // b
-  // c
-  // d
-  // e
-  // NULL
-  // f
-
-  // )###";
-  //   std::string dumpOutput = R"###(--- Values ---
-  // Column 0
-  //   D:7 R:0 V:a
-  //   D:7 R:3 V:b
-  //   D:7 R:2 V:c
-  //   D:4 R:1 NULL
-  //   D:7 R:2 V:d
-  //   D:7 R:0 V:a
-  //   D:7 R:3 V:b
-  //   D:7 R:2 V:c
-  //   D:7 R:3 V:d
-  //   D:4 R:1 NULL
-  //   D:7 R:2 V:e
-  //   D:7 R:0 V:a
-  //   D:7 R:3 V:b
-  //   D:7 R:2 V:c
-  //   D:7 R:3 V:d
-  //   D:7 R:2 V:e
-  //   D:4 R:1 NULL
-  //   D:7 R:2 V:f
-  // Column 1
-  //   D:0 R:0 V:1
-  //   D:0 R:0 V:1
-  //   D:0 R:0 V:1
-  // )###";
-
-  //   ASSERT_EQ(headerOutput + valuesOutput, ssValues.str());
-  //   ASSERT_EQ(headerOutput + dumpOutput, ssDump.str());
+  printer.DebugPrint(ss_values, columns, true, false, false, file);
+  printer.DebugPrint(ss_dump, columns, true, true, false, file);
+
+  ASSERT_EQ(header_output + values_output, ss_values.str());
+  ASSERT_EQ(header_output + dump_output, ss_dump.str());
 }
 
 TEST(TestJSONWithLocalFile, JSONOutput) {
-  std::string jsonOutput = R"###({
+  std::string json_output = R"###({
   "FileName": "alltypes_plain.parquet",
   "Version": "0",
   "CreatedBy": "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)",
@@ -383,7 +380,7 @@ TEST(TestJSONWithLocalFile, JSONOutput) {
   ParquetFilePrinter printer(reader.get());
   printer.JSONPrint(ss, columns, "alltypes_plain.parquet");
 
-  ASSERT_EQ(jsonOutput, ss.str());
+  ASSERT_EQ(json_output, ss.str());
 }
 
 }  // namespace parquet