Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/26 02:35:18 UTC

parquet-cpp git commit: PARQUET-721: benchmarks for reading into Arrow

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 5c1d9e94b -> 549a58b01


PARQUET-721: benchmarks for reading into Arrow

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #165 from xhochy/parquet-721 and squashes the following commits:

1c34bb1 [Uwe L. Korn] Fix cmake problems
b4a4448 [Uwe L. Korn] PARQUET-721: benchmarks for reading into Arrow


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/549a58b0
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/549a58b0
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/549a58b0

Branch: refs/heads/master
Commit: 549a58b010df5e5f5dcf601108e62fd1696ef788
Parents: 5c1d9e9
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Sun Sep 25 22:35:01 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun Sep 25 22:35:01 2016 -0400

----------------------------------------------------------------------
 src/parquet/arrow/CMakeLists.txt                |   9 ++
 .../arrow/arrow-reader-writer-benchmark.cc      | 157 +++++++++++++++++++
 2 files changed, 166 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/549a58b0/src/parquet/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/CMakeLists.txt b/src/parquet/arrow/CMakeLists.txt
index f36abe6..376e84a 100644
--- a/src/parquet/arrow/CMakeLists.txt
+++ b/src/parquet/arrow/CMakeLists.txt
@@ -83,6 +83,15 @@ else()
   ADD_PARQUET_LINK_LIBRARIES(arrow-reader-writer-test parquet_arrow_shared)
 endif()
 
+if(PARQUET_BUILD_BENCHMARKS)
+  ADD_PARQUET_BENCHMARK(arrow-reader-writer-benchmark)
+  if (PARQUET_BUILD_STATIC)
+    ADD_PARQUET_LINK_LIBRARIES(arrow-reader-writer-benchmark parquet_arrow_static)
+  else()
+    ADD_PARQUET_LINK_LIBRARIES(arrow-reader-writer-benchmark parquet_arrow_shared)
+  endif()
+endif()
+
 # Headers: top level
 install(FILES
   io.h

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/549a58b0/src/parquet/arrow/arrow-reader-writer-benchmark.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-benchmark.cc b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
new file mode 100644
index 0000000..5f0b3f5
--- /dev/null
+++ b/src/parquet/arrow/arrow-reader-writer-benchmark.cc
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include "parquet/arrow/reader.h"
+#include "parquet/arrow/writer.h"
+#include "parquet/file/reader-internal.h"
+#include "parquet/file/writer-internal.h"
+#include "parquet/column/reader.h"
+#include "parquet/column/writer.h"
+#include "parquet/util/input.h"
+
+#include "arrow/column.h"
+#include "arrow/schema.h"
+#include "arrow/table.h"
+#include "arrow/types/primitive.h"
+
+using arrow::NumericBuilder;
+
+namespace parquet {
+
+using arrow::FileReader;
+using arrow::WriteFlatTable;
+using schema::PrimitiveNode;
+
+namespace benchmark {
+
+// This should result in multiple pages for most primitive types
+constexpr int64_t BENCHMARK_SIZE = 10 * 1024 * 1024;
+
+template <typename ParquetType>
+struct benchmark_traits {};
+
+template <>
+struct benchmark_traits<Int32Type> {
+  using arrow_type = ::arrow::Int32Type;
+};
+
+template <>
+struct benchmark_traits<Int64Type> {
+  using arrow_type = ::arrow::Int64Type;
+};
+
+template <>
+struct benchmark_traits<DoubleType> {
+  using arrow_type = ::arrow::DoubleType;
+};
+
+template <typename ParquetType>
+using ArrowType = typename benchmark_traits<ParquetType>::arrow_type;
+
+template <typename ParquetType>
+std::shared_ptr<ColumnDescriptor> MakeSchema(Repetition::type repetition) {
+  auto node = PrimitiveNode::Make("int64", repetition, ParquetType::type_num);
+  return std::make_shared<ColumnDescriptor>(
+      node, repetition != Repetition::REQUIRED, repetition == Repetition::REPEATED);
+}
+
+template <bool nullable, typename ParquetType>
+void SetBytesProcessed(::benchmark::State& state) {
+  int64_t bytes_processed =
+      state.iterations() * BENCHMARK_SIZE * sizeof(typename ParquetType::c_type);
+  if (nullable) {
+    bytes_processed += state.iterations() * BENCHMARK_SIZE * sizeof(int16_t);
+  }
+  state.SetBytesProcessed(bytes_processed);
+}
+
+template <bool nullable, typename ParquetType>
+std::shared_ptr<::arrow::Table> TableFromVector(
+    const std::vector<typename ParquetType::c_type>& vec) {
+  ::arrow::TypePtr type = std::make_shared<ArrowType<ParquetType>>();
+  NumericBuilder<ArrowType<ParquetType>> builder(::arrow::default_memory_pool(), type);
+  if (nullable) {
+    std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
+    int n = {0};
+    std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
+    builder.Append(vec.data(), vec.size(), valid_bytes.data());
+  } else {
+    builder.Append(vec.data(), vec.size(), nullptr);
+  }
+  std::shared_ptr<::arrow::Array> array = builder.Finish();
+  auto field = std::make_shared<::arrow::Field>("column", type, nullable);
+  auto schema = std::make_shared<::arrow::Schema>(
+      std::vector<std::shared_ptr<::arrow::Field>>({field}));
+  auto column = std::make_shared<::arrow::Column>(field, array);
+  return std::make_shared<::arrow::Table>(
+      "table", schema, std::vector<std::shared_ptr<::arrow::Column>>({column}));
+}
+
+template <bool nullable, typename ParquetType>
+static void BM_WriteColumn(::benchmark::State& state) {
+  format::ColumnChunk thrift_metadata;
+  std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
+  std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+
+  while (state.KeepRunning()) {
+    auto output = std::make_shared<InMemoryOutputStream>();
+    WriteFlatTable(table.get(), ::arrow::default_memory_pool(), output, BENCHMARK_SIZE);
+  }
+  SetBytesProcessed<nullable, ParquetType>(state);
+}
+
+BENCHMARK_TEMPLATE(BM_WriteColumn, false, Int32Type);
+BENCHMARK_TEMPLATE(BM_WriteColumn, true, Int32Type);
+
+BENCHMARK_TEMPLATE(BM_WriteColumn, false, Int64Type);
+BENCHMARK_TEMPLATE(BM_WriteColumn, true, Int64Type);
+
+BENCHMARK_TEMPLATE(BM_WriteColumn, false, DoubleType);
+BENCHMARK_TEMPLATE(BM_WriteColumn, true, DoubleType);
+
+template <bool nullable, typename ParquetType>
+static void BM_ReadColumn(::benchmark::State& state) {
+  std::vector<typename ParquetType::c_type> values(BENCHMARK_SIZE, 128);
+  std::shared_ptr<::arrow::Table> table = TableFromVector<nullable, ParquetType>(values);
+  auto output = std::make_shared<InMemoryOutputStream>();
+  WriteFlatTable(table.get(), ::arrow::default_memory_pool(), output, BENCHMARK_SIZE);
+  std::shared_ptr<Buffer> buffer = output->GetBuffer();
+
+  while (state.KeepRunning()) {
+    auto reader = ParquetFileReader::Open(
+        std::unique_ptr<RandomAccessSource>(new BufferReader(buffer)));
+    FileReader filereader(::arrow::default_memory_pool(), std::move(reader));
+    std::shared_ptr<::arrow::Table> table;
+    filereader.ReadFlatTable(&table);
+  }
+  SetBytesProcessed<nullable, ParquetType>(state);
+}
+
+BENCHMARK_TEMPLATE(BM_ReadColumn, false, Int32Type);
+BENCHMARK_TEMPLATE(BM_ReadColumn, true, Int32Type);
+
+BENCHMARK_TEMPLATE(BM_ReadColumn, false, Int64Type);
+BENCHMARK_TEMPLATE(BM_ReadColumn, true, Int64Type);
+
+BENCHMARK_TEMPLATE(BM_ReadColumn, false, DoubleType);
+BENCHMARK_TEMPLATE(BM_ReadColumn, true, DoubleType);
+
+}  // namespace benchmark
+
+}  // namespace parquet
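
----------------------------------------------------------------------

For readers who have not used Google Benchmark before, here is a minimal, standalone sketch of the pattern the new file relies on: time the body inside state.KeepRunning(), then report throughput with SetBytesProcessed() so the runner prints a MB/s figure. The summing workload, kSize constant, and BM_SumColumn name are illustrative stand-ins, not taken from the commit; the real benchmarks above presumably get their main() from the ADD_PARQUET_BENCHMARK helper rather than from BENCHMARK_MAIN().

// Minimal, self-contained sketch (not part of the commit) of the
// KeepRunning / SetBytesProcessed pattern used by the benchmarks above.
#include <cstdint>
#include <numeric>
#include <vector>

#include "benchmark/benchmark.h"

constexpr int64_t kSize = 10 * 1024 * 1024;

static void BM_SumColumn(::benchmark::State& state) {
  std::vector<int64_t> values(kSize, 128);
  while (state.KeepRunning()) {
    // Timed region: the runner repeats this body until timings stabilize.
    int64_t sum = std::accumulate(values.begin(), values.end(), int64_t{0});
    ::benchmark::DoNotOptimize(sum);
  }
  // As in SetBytesProcessed() above: bytes = iterations * element count * element width,
  // which the runner turns into a throughput column alongside the wall-clock numbers.
  state.SetBytesProcessed(state.iterations() * kSize * sizeof(int64_t));
}
BENCHMARK(BM_SumColumn);

BENCHMARK_MAIN();

The 10 * 1024 * 1024 element BENCHMARK_SIZE in the committed file serves the same role as kSize here, sized so that most primitive types span multiple data pages per the comment in the source.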