You are viewing a plain text version of this content. The canonical link for it is here.

Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/09/16 10:05:56 UTC

[GitHub] [arrow] pitrou opened a new pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

pitrou opened a new pull request #8203:
URL: https://github.com/apache/arrow/pull/8203


   These benchmarks only measure one-level nesting (struct, list) for now.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-694318637


   Thanks for the review :-)


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] github-actions[bot] commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

github-actions[bot] commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693306850


   https://issues.apache.org/jira/browse/ARROW-10024


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489573749



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       It's not really meant to be representative of anything real, just not outlandish :-)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489555668



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       Ah, you meant the size of individual lists? Well, since the list array will be `kNumValues / 10` large, individual lists should have 10 items on average.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489553561



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       `kNumValues / 10`




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489573502



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);

Review comment:
       Will do.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489554513



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       So lists get larger as the overall array size goes up?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311458






----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489593595



##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
 }
 
 std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
-                                                     int32_t last_offset) {
+                                                     int32_t last_offset,
+                                                     double null_probability) {
   using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
-  GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+  GenOpt options(seed(), first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
+  int64_t null_count = 0;
+  buffers[0] = *AllocateEmptyBitmap(size);
+  options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+  // Make sure the first and last entry are non-null

Review comment:
       I'd rather keep this after all. It doesn't hurt and at least half of it is necessary.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308298


   @ursabot archery benchmark run --suite-filter=parquet-arrow-reader-writer-benchmark


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308298






----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489554019



##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
 }
 
 std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
-                                                     int32_t last_offset) {
+                                                     int32_t last_offset,
+                                                     double null_probability) {
   using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
-  GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+  GenOpt options(seed(), first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
+  int64_t null_count = 0;
+  buffers[0] = *AllocateEmptyBitmap(size);
+  options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+  // Make sure the first and last entry are non-null

Review comment:
       Hmm... you're right, it's probably not :-S




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311458


   @ursabot archery benchmark --suite-filter=parquet-arrow-reader-writer-benchmark


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] kszucs commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

kszucs commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693309587


   @ursabot benchmark run --help


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693307330


   Here are the results:
   ```
   BM_ReadStructColumn/0     1706505 ns      1704955 ns          409 bytes_per_second=6.87335G/s items_per_second=615.017M/s
   BM_ReadStructColumn/1     7827665 ns      7821031 ns           90 bytes_per_second=1.49836G/s items_per_second=134.071M/s
   BM_ReadStructColumn/50   14142868 ns     14135647 ns           49 bytes_per_second=848.918M/s items_per_second=74.1796M/s
   BM_ReadStructColumn/99    3846944 ns      3844119 ns          182 bytes_per_second=3.04849G/s items_per_second=272.774M/s
   
   BM_ReadListColumn/0     7949690 ns      7946976 ns           87 bytes_per_second=1006.67M/s items_per_second=131.947M/s
   BM_ReadListColumn/1     9302755 ns      9299473 ns           76 bytes_per_second=860.264M/s items_per_second=112.756M/s
   BM_ReadListColumn/50   19390741 ns     19386146 ns           36 bytes_per_second=412.666M/s items_per_second=54.0889M/s
   BM_ReadListColumn/99    7162830 ns      7159609 ns           97 bytes_per_second=1117.38M/s items_per_second=146.457M/s
   ```
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489556466



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       Ah, OK, I'm not sure if this is representative of real workloads, but it is better than nothing.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] wesm commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

wesm commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693456506


   Looks like there is something wrong with the Ursabot result reporting, but you can see the results here:
   
   https://ci.ursalabs.org/#/builders/73/builds/103/steps/3/logs/result


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489591904



##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
 }
 
 std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
-                                                     int32_t last_offset) {
+                                                     int32_t last_offset,
+                                                     double null_probability) {
   using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
-  GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+  GenOpt options(seed(), first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
+  int64_t null_count = 0;
+  buffers[0] = *AllocateEmptyBitmap(size);
+  options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+  // Make sure the first and last entry are non-null

Review comment:
       Well, at least the last offset should be non-null (according to `ListArray::FromArrays`). The first one needn't, apparently.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489541639



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);

Review comment:
       Could you comment the -5 and 5 to indicate what these parameters represent?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311487


   ```
   No such command "archery".
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489538445



##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
 }
 
 std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
-                                                     int32_t last_offset) {
+                                                     int32_t last_offset,
+                                                     double null_probability) {
   using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
-  GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+  GenOpt options(seed(), first_offset, last_offset, null_probability);
 
   BufferVector buffers{2};
 
+  int64_t null_count = 0;
+  buffers[0] = *AllocateEmptyBitmap(size);
+  options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+  // Make sure the first and last entry are non-null

Review comment:
       why is this important?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693574989


   I think I addressed your review comments @emkornfield .


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489588294



##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
                                  double null_probability = 0);
 
-  /// \brief Generates a random UInt8Array
+  /// \brief Generate a random UInt8Array

Review comment:
       Hmm, I can add this somewhere in the docs. Most of our docstrings use the infinitive, AFAIK.

##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
                                  double null_probability = 0);
 
-  /// \brief Generates a random UInt8Array
+  /// \brief Generate a random UInt8Array

Review comment:
       Hmm, I can add this somewhere in the docs. Most of our docstrings use the infinitive, AFAIK.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308768


   [AMD64 Ubuntu 18.04 C++ Benchmark (#126385)](https://ci.ursalabs.org/#builders/73/builds/102) builder failed.
   
   Revision: e0b975e83ad6bae138811326e8f82c27d1cb5676
   
   Archery: `'archery benchmark ...'` step's stderr:
   ```
   Using optional gold linker (version 1.15)
   Configured for RELEASE build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})
   Building Apache Thrift from source
   CMake Warning at cmake_modules/ThirdpartyToolchain.cmake:189 (find_package):
     No "Findbenchmark.cmake" found in CMAKE_MODULE_PATH.
   Call Stack (most recent call first):
     cmake_modules/ThirdpartyToolchain.cmake:1756 (resolve_dependency)
     CMakeLists.txt:495 (include)
   
   
   CMake Warning (dev) at cmake_modules/ThirdpartyToolchain.cmake:189 (find_package):
     Findbenchmark.cmake must either be part of this project itself, in this
     case adjust CMAKE_MODULE_PATH so that it points to the correct location
     inside its source tree.
   
     Or it must be installed by a package which has already been found via
     find_package().  In this case make sure that package has indeed been found
     and adjust CMAKE_MODULE_PATH to contain the location where that package has
     installed Findbenchmark.cmake.  This must be a location provided by that
     package.  This error in general means that the buildsystem of this project
     is relying on a Find-module without ensuring that it is actually available.
   
   Call Stack (most recent call first):
     cmake_modules/ThirdpartyToolchain.cmake:1756 (resolve_dependency)
     CMakeLists.txt:495 (include)
   This warning is for project developers.  Use -Wno-dev to suppress it.
   
   Cloning into '/tmp/arrow-archery-udq6fnfw/run/arrow'...
   done.
   Checking out files:  95% (4663/4863)   
   Checking out files:  96% (4669/4863)   
   Checking out files:  97% (4718/4863)   
   Checking out files:  98% (4766/4863)   
   Checking out files:  99% (4815/4863)   
   Checking out files: 100% (4863/4863)   
   Checking out files: 100% (4863/4863), done.
   fatal: ambiguous argument 'run': unknown revision or path not in the working tree.
   Use '--' to separate paths from revisions, like this:
   'git <command> [<revision>...] -- [<file>...]'
   Traceback (most recent call last):
     File "/usr/local/bin/archery", line 11, in <module>
       load_entry_point('archery', 'console_scripts', 'archery')()
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 764, in __call__
       return self.main(*args, **kwargs)
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 717, in main
       rv = self.invoke(ctx)
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1137, in invoke
       return _process_result(sub_ctx.command.invoke(sub_ctx))
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1137, in invoke
       return _process_result(sub_ctx.command.invoke(sub_ctx))
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 956, in invoke
       return ctx.invoke(self.callback, **ctx.params)
     File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 555, in invoke
       return callback(*args, **kwargs)
     File "/usr/local/lib/python3.6/dist-packages/click/decorators.py", line 17, in new_func
       return f(get_current_context(), *args, **kwargs)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/cli.py", line 569, in benchmark_diff
       benchmark_filter=benchmark_filter)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/benchmark/runner.py", line 83, in from_rev_or_path
       src_rev, _ = src.at_revision(rev_or_path, clone_dir)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/source.py", line 153, in at_revision
       original_revision = git.rev_parse(revision)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/command.py", line 48, in wrapper
       return list_it(strip_it(f(*argv, **kwargs).stdout))
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 29, in wrapper
       return fn(self, sub_cmd, *argv, **kwargs)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 78, in rev_parse
       return self.run_cmd(*argv, **kwargs)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 43, in run_cmd
       return self.run(*opts, cmd, *argv, **kwargs)
     File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/command.py", line 74, in run
       return subprocess.run(invocation, **kwargs)
     File "/usr/lib/python3.6/subprocess.py", line 438, in run
       output=stdout, stderr=stderr)
   subprocess.CalledProcessError: Command '['git', 'rev-parse', 'run']' returned non-zero exit status 128.
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield closed pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield closed pull request #8203:
URL: https://github.com/apache/arrow/pull/8203


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308311


   ```
   No such command "archery".
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693460386


   Yeah, that's not what I wanted anyway. I want to run the new benchmarks, not diff existing benchmarks. Never mind...


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-694307078






----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489540876



##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
   std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
                                  double null_probability = 0);
 
-  /// \brief Generates a random UInt8Array
+  /// \brief Generate a random UInt8Array

Review comment:
       nit: why the change?  If we want to standardize on one vs the other we should probably update the style guide?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] ursabot removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

ursabot removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311487






----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693309595


   ```
   Usage: @ursabot benchmark [OPTIONS] [<baseline>]
   
     Run the benchmark suite in comparison mode.
   
     This command will run the benchmark suite for tip of the branch commit
     against `<baseline>` (or master if not provided).
   
     Examples:
   
     # Run the all the benchmarks
     @ursabot benchmark
   
     # Compare only benchmarks where the name matches the /^Sum/ regex
     @ursabot benchmark --benchmark-filter=^Sum
   
     # Compare only benchmarks where the suite matches the /compute-/ regex.
     # A suite is the C++ binary.
     @ursabot benchmark --suite-filter=compute-
   
     # Sometimes a new optimization requires the addition of new benchmarks to
     # quantify the performance increase. When doing this be sure to add the
     # benchmark in a separate commit before introducing the optimization.
     #
     # Note that specifying the baseline is the only way to compare using a new
     # benchmark, since master does not contain the new benchmark and no
     # comparison is possible.
     #
     # The following command compares the results of matching benchmarks,
     # compiling against HEAD and the provided baseline commit, e.g. eaf8302.
     # You can use this to quantify the performance improvement of new
     # optimizations or to check for regressions.
     @ursabot benchmark --benchmark-filter=MyBenchmark eaf8302
   
   Options:
     --suite-filter <regex>      Regex filtering benchmark suites.
     --benchmark-filter <regex>  Regex filtering benchmarks.
     --cc <compiler>             C compiler.
     --cxx <compiler>            C++ compiler.
     --cxx-flags TEXT            C++ compiler flags.
     --repetitions INTEGER       Number of repetitions of each benchmark.
                                 Increasing may improve result precision.
                                 [default: 1]
     --help                      Show this message and exit.
   ```


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks

Posted by GitBox <gi...@apache.org>.

emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489542938



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
     ->Args({kAlternatingOrNa, 1})
     ->Args({5, 10});
 
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+  auto values2 =
+      rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+  const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+  std::shared_ptr<::arrow::Buffer> null_bitmap;
+  if (nullable) {
+    null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+  }
+  auto array = *::arrow::StructArray::Make(
+      {values1, values2},
+      ::arrow::FieldVector{field("a", values1->type(), nullable),
+                           field("b", values2->type(), nullable)},
+      null_bitmap);
+  auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+  auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+  EXIT_NOT_OK(table->Validate());
+
+  BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+    ->Arg(/*null_percentage=*/0)
+    ->Arg(/*null_percentage=*/1)
+    ->Arg(/*null_percentage=*/50)
+    ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+  constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+  const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+  const bool nullable = (null_probability != 0.0);
+
+  ARROW_CHECK_GE(null_probability, 0.0);
+
+  ::arrow::random::RandomArrayGenerator rng(42);
+
+  auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+  auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);

Review comment:
       approximately what size lists does this generate?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org