You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/05/26 12:21:50 UTC
[arrow] branch master updated: PARQUET-1586: [C++] Add --dump options to parquet-reader tool to dump def/rep levels

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d82ac40  PARQUET-1586: [C++] Add --dump options to parquet-reader tool to dump def/rep levels
d82ac40 is described below

commit d82ac407fab1d4b28669b8f7a940f88d39dfd874
Author: Renat Valiullin <ri...@gmail.com>
AuthorDate: Sun May 26 14:21:27 2019 +0200

    PARQUET-1586: [C++] Add --dump options to parquet-reader tool to dump def/rep levels
    
    Author: Renat Valiullin <ri...@gmail.com>
    
    Closes #4385 from rip-nsk/PARQUET-1586 and squashes the following commits:
    
    2d5c9e38f <Renat Valiullin> fixes for lint
    1e370238b <Renat Valiullin> fix lint
    a7bcacee3 <Renat Valiullin> parquet-reader --dump
---
 cpp/src/parquet/column_scanner.h    |  17 ++++--
 cpp/src/parquet/printer.cc          |  44 +++++++++-------
 cpp/src/parquet/printer.h           |   3 +-
 cpp/src/parquet/reader-test.cc      | 101 +++++++++++++++++++++++++++++++++---
 cpp/tools/parquet/parquet-reader.cc |   9 ++--
 5 files changed, 140 insertions(+), 34 deletions(-)

diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h
index cb0da2c..1e084a9 100644
--- a/cpp/src/parquet/column_scanner.h
+++ b/cpp/src/parquet/column_scanner.h
@@ -62,7 +62,7 @@ class PARQUET_EXPORT Scanner {
       int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
 
-  virtual void PrintNext(std::ostream& out, int width) = 0;
+  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
 
   bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
 
@@ -171,15 +171,24 @@ class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
     return true;
   }
 
-  virtual void PrintNext(std::ostream& out, int width) {
+  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
     T val;
+    int16_t def_level = -1;
+    int16_t rep_level = -1;
     bool is_null = false;
-    char buffer[25];
+    char buffer[80];
 
-    if (!NextValue(&val, &is_null)) {
+    if (!Next(&val, &def_level, &rep_level, &is_null)) {
       throw ParquetException("No more values buffered");
     }
 
+    if (with_levels) {
+      out << "  D:" << def_level << " R:" << rep_level << " ";
+      if (!is_null) {
+        out << "V:";
+      }
+    }
+
     if (is_null) {
       std::string null_fmt = format_fwf<ByteArrayType>(width);
       snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 82c52ff..d78d9b9 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -42,11 +42,11 @@ class ColumnReader;
 // ParquetFilePrinter::DebugPrint
 
 // the fixed initial size is just for an example
-#define COL_WIDTH "30"
+#define COL_WIDTH 30
 
 void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
-                                    bool print_values, bool print_key_value_metadata,
-                                    const char* filename) {
+                                    bool print_values, bool format_dump,
+                                    bool print_key_value_metadata, const char* filename) {
   const FileMetaData* file_metadata = fileReader->metadata().get();
 
   stream << "File Name: " << filename << "\n";
@@ -54,7 +54,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
   stream << "Created By: " << file_metadata->created_by() << "\n";
   stream << "Total rows: " << file_metadata->num_rows() << "\n";
 
-  if (print_key_value_metadata) {
+  if (print_key_value_metadata && file_metadata->key_value_metadata()) {
     auto key_value_metadata = file_metadata->key_value_metadata();
     int64_t size_of_key_value_metadata = key_value_metadata->size();
     stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
@@ -95,7 +95,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
     std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
 
     stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
-    stream << "  Rows: " << group_metadata->num_rows() << "---\n";
+    stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
 
     // Print column metadata
     for (auto i : selected_columns) {
@@ -103,7 +103,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
       std::shared_ptr<Statistics> stats = column_chunk->statistics();
 
       const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
-      stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values();
+      stream << "Column " << i << std::endl << "  Values: " << column_chunk->num_values();
       if (column_chunk->is_stats_set()) {
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
         stream << ", Null Values: " << stats->null_count()
@@ -115,9 +115,9 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
       }
       stream << std::endl
              << "  Compression: " << CompressionToString(column_chunk->compression())
-             << ", Encodings: ";
+             << ", Encodings:";
       for (auto encoding : column_chunk->encodings()) {
-        stream << EncodingToString(encoding) << " ";
+        stream << " " << EncodingToString(encoding);
       }
       stream << std::endl
              << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
@@ -128,8 +128,9 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
     if (!print_values) {
       continue;
     }
+    stream << "--- Values ---\n";
 
-    static constexpr int bufsize = 25;
+    static constexpr int bufsize = COL_WIDTH + 1;
     char buffer[bufsize];
 
     // Create readers for selected columns and print contents
@@ -137,18 +138,25 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
     int j = 0;
     for (auto i : selected_columns) {
       std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+      // This is OK in this method as long as the RowGroupReader does not get
+      // deleted
+      auto& scanner = scanners[j++] = Scanner::Make(col_reader);
 
-      std::stringstream ss;
-      ss << "%-" << COL_WIDTH << "s";
-      std::string fmt = ss.str();
+      if (format_dump) {
+        stream << "Column " << i << std::endl;
+        while (scanner->HasNext()) {
+          scanner->PrintNext(stream, 0, true);
+          stream << "\n";
+        }
+        continue;
+      }
 
-      snprintf(buffer, bufsize, fmt.c_str(),
+      snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
                file_metadata->schema()->Column(i)->name().c_str());
       stream << buffer;
-
-      // This is OK in this method as long as the RowGroupReader does not get
-      // deleted
-      scanners[j++] = Scanner::Make(col_reader);
+    }
+    if (format_dump) {
+      continue;
     }
     stream << "\n";
 
@@ -158,7 +166,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
       for (auto scanner : scanners) {
         if (scanner->HasNext()) {
           hasRow = true;
-          scanner->PrintNext(stream, 27);
+          scanner->PrintNext(stream, COL_WIDTH);
         }
       }
       stream << "\n";
diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h
index 4591e7a..9071270 100644
--- a/cpp/src/parquet/printer.h
+++ b/cpp/src/parquet/printer.h
@@ -36,7 +36,8 @@ class PARQUET_EXPORT ParquetFilePrinter {
   ~ParquetFilePrinter() {}
 
   void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
-                  bool print_values = true, bool print_key_value_metadata = false,
+                  bool print_values = false, bool format_dump = false,
+                  bool print_key_value_metadata = false,
                   const char* filename = "No Name");
 
   void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc
index 9c721c0..aad89b2 100644
--- a/cpp/src/parquet/reader-test.cc
+++ b/cpp/src/parquet/reader-test.cc
@@ -37,20 +37,17 @@ namespace parquet {
 
 using ReadableFile = ::arrow::io::ReadableFile;
 
-std::string alltypes_plain() {
+std::string data_file(const char* file) {
   std::string dir_string(test::get_data_dir());
   std::stringstream ss;
-  ss << dir_string << "/"
-     << "alltypes_plain.parquet";
+  ss << dir_string << "/" << file;
   return ss.str();
 }
 
+std::string alltypes_plain() { return data_file("alltypes_plain.parquet"); }
+
 std::string nation_dict_truncated_data_page() {
-  std::string dir_string(test::get_data_dir());
-  std::stringstream ss;
-  ss << dir_string << "/"
-     << "nation.dict-malformed.parquet";
-  return ss.str();
+  return data_file("nation.dict-malformed.parquet");
 }
 
 class TestAllTypesPlain : public ::testing::Test {
@@ -255,6 +252,94 @@ TEST(TestFileReaderAdHoc, NationDictTruncatedDataPage) {
   ASSERT_EQ(ss2.str(), ss.str());
 }
 
+TEST(TestDumpWithLocalFile, DumpOutput) {
+  std::string headerOutput = R"###(File Name: nested_lists.snappy.parquet
+Version: 1.0
+Created By: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c)
+Total rows: 3
+Number of RowGroups: 1
+Number of Real Columns: 2
+Number of Columns: 2
+Number of Selected Columns: 2
+Column 0: element (BYTE_ARRAY)
+Column 1: b (INT32)
+--- Row Group 0 ---
+--- Total Bytes 155 ---
+--- Rows: 3 ---
+Column 0
+  Values: 18  Statistics Not Set
+  Compression: SNAPPY, Encodings: RLE PLAIN_DICTIONARY
+  Uncompressed Size: 103, Compressed Size: 104
+Column 1
+  Values: 3, Null Values: 0, Distinct Values: 0
+  Max: 1, Min: 1
+  Compression: SNAPPY, Encodings: BIT_PACKED PLAIN_DICTIONARY
+  Uncompressed Size: 52, Compressed Size: 56
+)###";
+  std::string valuesOutput = R"###(--- Values ---
+element                       b                             
+a                             1                             
+b                             1                             
+c                             1                             
+NULL                          
+d                             
+a                             
+b                             
+c                             
+d                             
+NULL                          
+e                             
+a                             
+b                             
+c                             
+d                             
+e                             
+NULL                          
+f                             
+
+)###";
+  std::string dumpOutput = R"###(--- Values ---
+Column 0
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:4 R:1 NULL
+  D:7 R:2 V:d
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:7 R:3 V:d
+  D:4 R:1 NULL
+  D:7 R:2 V:e
+  D:7 R:0 V:a
+  D:7 R:3 V:b
+  D:7 R:2 V:c
+  D:7 R:3 V:d
+  D:7 R:2 V:e
+  D:4 R:1 NULL
+  D:7 R:2 V:f
+Column 1
+  D:0 R:0 V:1
+  D:0 R:0 V:1
+  D:0 R:0 V:1
+)###";
+
+  std::stringstream ssValues, ssDump;
+  // empty list means print all
+  std::list<int> columns;
+
+  const char* file = "nested_lists.snappy.parquet";
+  auto reader_props = default_reader_properties();
+  auto reader = ParquetFileReader::OpenFile(data_file(file), false, reader_props);
+  ParquetFilePrinter printer(reader.get());
+
+  printer.DebugPrint(ssValues, columns, true, false, false, file);
+  ASSERT_EQ(headerOutput + valuesOutput, ssValues.str());
+
+  printer.DebugPrint(ssDump, columns, true, true, false, file);
+  ASSERT_EQ(headerOutput + dumpOutput, ssDump.str());
+}
+
 TEST(TestJSONWithLocalFile, JSONOutput) {
   std::string jsonOutput = R"###({
   "FileName": "alltypes_plain.parquet",
diff --git a/cpp/tools/parquet/parquet-reader.cc b/cpp/tools/parquet/parquet-reader.cc
index a5b7db1..c9194ab 100644
--- a/cpp/tools/parquet/parquet-reader.cc
+++ b/cpp/tools/parquet/parquet-reader.cc
@@ -23,8 +23,8 @@
 
 int main(int argc, char** argv) {
   if (argc > 5 || argc < 2) {
-    std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json]"
-                 "[--print-key-value-metadata] [--columns=...] <file>"
+    std::cerr << "Usage: parquet-reader [--only-metadata] [--no-memory-map] [--json] "
+              << "[--dump] [--print-key-value-metadata] [--columns=...] <file>"
               << std::endl;
     return -1;
   }
@@ -34,6 +34,7 @@ int main(int argc, char** argv) {
   bool print_key_value_metadata = false;
   bool memory_map = true;
   bool format_json = false;
+  bool format_dump = false;
 
   // Read command-line options
   const std::string COLUMNS_PREFIX = "--columns=";
@@ -49,6 +50,8 @@ int main(int argc, char** argv) {
       memory_map = false;
     } else if ((param = std::strstr(argv[i], "--json"))) {
       format_json = true;
+    } else if ((param = std::strstr(argv[i], "--dump"))) {
+      format_dump = true;
     } else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
       value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
       while (value) {
@@ -68,7 +71,7 @@ int main(int argc, char** argv) {
       printer.JSONPrint(std::cout, columns, filename.c_str());
     } else {
       printer.DebugPrint(std::cout, columns, print_values,
-        print_key_value_metadata, filename.c_str());
+        format_dump, print_key_value_metadata, filename.c_str());
     }
   } catch (const std::exception& e) {
     std::cerr << "Parquet error: " << e.what() << std::endl;