You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/27 12:19:40 UTC

[arrow] 01/24: PARQUET-681: Add tool to scan a parquet file

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 6beeaf4d40ea36c048e9a1a2d4410ca4e5d66864
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Mon Sep 5 15:49:36 2016 -0400

    PARQUET-681: Add tool to scan a parquet file
    
    Added a ReadBatchValues() API to the Column class.
    Added a parquet-scan tool
    Separated examples into benchmarks/tools
    Added clang-tidy and clang-format to benchmarks and tools
    
    Author: Deepak Majeti <de...@hpe.com>
    
    Closes #144 from majetideepak/parquetscan and squashes the following commits:
    
    cc7f183 [Deepak Majeti] Removed GetRemainingInPage API
    44da480 [Deepak Majeti] add scan all in public api
    20829b8 [Deepak Majeti] clang-format
    da62354 [Deepak Majeti] ScanAllValues API
    e385f61 [Deepak Majeti] put clang-* in the root directory
    9ff785c [Deepak Majeti] use c++ random
    d854bde [Deepak Majeti] parquet scan tool
    
    Change-Id: I1e5d1e42aa5a3e8dfbe6b556dd0081bb0ed7f4d8
---
 cpp/tools/parquet/CMakeLists.txt         |  34 ++++++++++
 cpp/tools/parquet/parquet-dump-schema.cc |  36 +++++++++++
 cpp/tools/parquet/parquet-scan.cc        | 108 +++++++++++++++++++++++++++++++
 cpp/tools/parquet/parquet_reader.cc      |  67 +++++++++++++++++++
 4 files changed, 245 insertions(+)

diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt
new file mode 100644
index 0000000..5c4eaa8
--- /dev/null
+++ b/cpp/tools/parquet/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Third-party static libraries shared by every command-line tool below.
+set(LINK_LIBS
+  snappystatic
+  thriftstatic)
+
+# Each tool is a single-source executable named after its .cc file and is
+# linked against the static parquet library plus the common LINK_LIBS.
+if (PARQUET_BUILD_EXECUTABLES)
+  foreach(TOOL_NAME
+      parquet-dump-schema
+      parquet_reader
+      parquet-scan)
+    add_executable(${TOOL_NAME} ${TOOL_NAME}.cc)
+    target_link_libraries(${TOOL_NAME} ${LINK_LIBS}
+      parquet_static)
+  endforeach()
+endif()
diff --git a/cpp/tools/parquet/parquet-dump-schema.cc b/cpp/tools/parquet/parquet-dump-schema.cc
new file mode 100644
index 0000000..deef2fd
--- /dev/null
+++ b/cpp/tools/parquet/parquet-dump-schema.cc
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "parquet/api/reader.h"
+#include "parquet/api/schema.h"
+
+// Prints the schema of the Parquet file named on the command line to stdout.
+//
+// Usage: parquet-dump-schema <file>
+//
+// Returns 0 on success. Returns -1 when no file argument is given or when
+// opening or reading the file throws; the reason is written to stderr.
+int main(int argc, char** argv) {
+  // argv[1] is a null pointer when no argument is supplied, and building a
+  // std::string from it is undefined behaviour, so check before using it.
+  if (argc < 2) {
+    std::cerr << "Usage: parquet-dump-schema <file>" << std::endl;
+    return -1;
+  }
+
+  std::string filename = argv[1];
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/cpp/tools/parquet/parquet-scan.cc b/cpp/tools/parquet/parquet-scan.cc
new file mode 100644
index 0000000..d146a1d
--- /dev/null
+++ b/cpp/tools/parquet/parquet-scan.cc
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/api/reader.h"
+
+// Returns true when `arg` begins with `prefix`.
+static bool HasPrefix(const char* arg, const std::string& prefix) {
+  return std::strncmp(arg, prefix.c_str(), prefix.length()) == 0;
+}
+
+// Scans the selected columns of a Parquet file in batches and reports how many
+// rows were read and how much processor time (std::clock) the scan took.
+//
+// Usage: parquet-scan [--batch-size=N] [--columns=a,b,c] <file>
+//   --batch-size=N   batch size passed to ScanAllValues (default 256, N > 0)
+//   --columns=a,b,c  column indices to scan; every column when omitted
+//
+// Returns 0 on success and -1 on a usage error or when the reader throws.
+int main(int argc, char** argv) {
+  const std::string USAGE =
+      "Usage: parquet-scan [--batch-size=] [--columns=...] <file>";
+  // argv[0], the file name and at most the two options.
+  if (argc > 4 || argc < 2) {
+    std::cerr << USAGE << std::endl;
+    return -1;
+  }
+
+  std::string filename;
+
+  // Read command-line options
+  int batch_size = 256;
+  const std::string COLUMNS_PREFIX = "--columns=";
+  const std::string BATCH_SIZE_PREFIX = "--batch-size=";
+  std::vector<int> columns;
+
+  for (int i = 1; i < argc; i++) {
+    // Options are recognised only at the start of an argument, so a file name
+    // that merely contains an option string is still treated as the file.
+    if (HasPrefix(argv[i], COLUMNS_PREFIX)) {
+      // strtok splits the comma-separated list in place inside argv[i].
+      char* value = std::strtok(argv[i] + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+      }
+    } else if (HasPrefix(argv[i], BATCH_SIZE_PREFIX)) {
+      char* value = std::strtok(argv[i] + BATCH_SIZE_PREFIX.length(), " ");
+      if (value) { batch_size = std::atoi(value); }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  if (filename.empty()) {
+    std::cerr << USAGE << std::endl;
+    return -1;
+  }
+  // atoi yields 0 for non-numeric text, and a non-positive size cannot be used
+  // to allocate the level and value buffers below.
+  if (batch_size <= 0) {
+    std::cerr << "Parquet error: batch size must be a positive integer" << std::endl;
+    return -1;
+  }
+
+  try {
+    // Allocated inside the try block so an allocation failure is reported too.
+    std::vector<int16_t> rep_levels(batch_size);
+    std::vector<int16_t> def_levels(batch_size);
+
+    std::clock_t start_time = std::clock();
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename);
+    // columns are not specified explicitly. Add all columns
+    if (columns.empty()) {
+      const int file_columns = reader->metadata()->num_columns();
+      for (int i = 0; i < file_columns; i++) {
+        columns.push_back(i);
+      }
+    }
+
+    // One running total per scanned column. It is zeroed once here so that the
+    // counts accumulate across row groups instead of restarting in each one.
+    std::vector<int64_t> total_rows(columns.size(), 0);
+
+    for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+      auto group_reader = reader->RowGroup(r);
+      size_t col = 0;
+      for (auto i : columns) {
+        std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
+        size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+        std::vector<uint8_t> values(batch_size * value_byte_size);
+
+        int64_t values_read = 0;
+        while (col_reader->HasNext()) {
+          total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
+              rep_levels.data(), values.data(), &values_read, col_reader.get());
+        }
+        col++;
+      }
+    }
+
+    const double total_time =
+        (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
+    for (size_t ct = 1; ct < total_rows.size(); ++ct) {
+      if (total_rows[0] != total_rows[ct]) {
+        std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
+      }
+    }
+    // A file without columns produces no totals; report zero rather than read
+    // an element that does not exist.
+    const int64_t rows_scanned = total_rows.empty() ? 0 : total_rows[0];
+    std::cout << rows_scanned << " rows scanned in " << total_time << " seconds."
+              << std::endl;
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/cpp/tools/parquet/parquet_reader.cc b/cpp/tools/parquet/parquet_reader.cc
new file mode 100644
index 0000000..ced84d5
--- /dev/null
+++ b/cpp/tools/parquet/parquet_reader.cc
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <string>
+
+#include "parquet/api/reader.h"
+
+// Opens a Parquet file and writes its DebugPrint output to stdout.
+//
+// Usage: parquet_reader [--only-metadata] [--no-memory-map] [--columns=...] <file>
+//   --only-metadata  passes print_values = false to DebugPrint
+//   --no-memory-map  passes memory_map = false to OpenFile
+//   --columns=a,b,c  comma-separated column indices passed to DebugPrint
+//
+// Returns 0 on success and -1 on a usage error or when the reader throws.
+int main(int argc, char** argv) {
+  const std::string USAGE =
+      "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
+      "[--columns=...] <file>";
+  if (argc > 5 || argc < 2) {
+    std::cerr << USAGE << std::endl;
+    return -1;
+  }
+
+  std::string filename;
+  bool print_values = true;
+  bool memory_map = true;
+
+  // Read command-line options
+  const std::string COLUMNS_PREFIX = "--columns=";
+  std::list<int> columns;
+
+  for (int i = 1; i < argc; i++) {
+    // Flags must match the whole argument and --columns= must start it, so a
+    // file name that merely contains an option string is not mistaken for one.
+    const std::string arg = argv[i];
+    if (arg == "--only-metadata") {
+      print_values = false;
+    } else if (arg == "--no-memory-map") {
+      memory_map = false;
+    } else if (arg.compare(0, COLUMNS_PREFIX.length(), COLUMNS_PREFIX) == 0) {
+      // strtok splits the comma-separated list in place inside argv[i].
+      char* value = std::strtok(argv[i] + COLUMNS_PREFIX.length(), ",");
+      while (value) {
+        columns.push_back(std::atoi(value));
+        value = std::strtok(nullptr, ",");
+      }
+    } else {
+      filename = argv[i];
+    }
+  }
+
+  // No file name was given, so there is nothing to open.
+  if (filename.empty()) {
+    std::cerr << USAGE << std::endl;
+    return -1;
+  }
+
+  try {
+    std::unique_ptr<parquet::ParquetFileReader> reader =
+        parquet::ParquetFileReader::OpenFile(filename, memory_map);
+    reader->DebugPrint(std::cout, columns, print_values);
+  } catch (const std::exception& e) {
+    std::cerr << "Parquet error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}