You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/09/16 10:05:56 UTC
[GitHub] [arrow] pitrou opened a new pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
pitrou opened a new pull request #8203:
URL: https://github.com/apache/arrow/pull/8203
These benchmarks only measure one-level nesting (struct, list) for now.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-694318637
Thanks for the review :-)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693306850
https://issues.apache.org/jira/browse/ARROW-10024
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489573749
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
It's not really meant to be representative of anything real, just not outlandish :-)
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489555668
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
Ah, you meant the size of individual lists? Well, since the list array will be `kNumValues / 10` long, individual lists should have 10 items on average.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489553561
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
`kNumValues / 10`
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489573502
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
Review comment:
Will do.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489554513
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
So lists get larger as the overall array size goes up?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311458
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489593595
##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
}
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
- int32_t last_offset) {
+ int32_t last_offset,
+ double null_probability) {
using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
- GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+ GenOpt options(seed(), first_offset, last_offset, null_probability);
BufferVector buffers{2};
+ int64_t null_count = 0;
+ buffers[0] = *AllocateEmptyBitmap(size);
+ options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+ // Make sure the first and last entry are non-null
Review comment:
I'd rather keep this after all. It doesn't hurt and at least half of it is necessary.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308298
@ursabot archery benchmark run --suite-filter=parquet-arrow-reader-writer-benchmark
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308298
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489554019
##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
}
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
- int32_t last_offset) {
+ int32_t last_offset,
+ double null_probability) {
using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
- GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+ GenOpt options(seed(), first_offset, last_offset, null_probability);
BufferVector buffers{2};
+ int64_t null_count = 0;
+ buffers[0] = *AllocateEmptyBitmap(size);
+ options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+ // Make sure the first and last entry are non-null
Review comment:
Hmm... you're right, it's probably not :-S
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311458
@ursabot archery benchmark --suite-filter=parquet-arrow-reader-writer-benchmark
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] kszucs commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
kszucs commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693309587
@ursabot benchmark run --help
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693307330
Here are the results:
```
BM_ReadStructColumn/0 1706505 ns 1704955 ns 409 bytes_per_second=6.87335G/s items_per_second=615.017M/s
BM_ReadStructColumn/1 7827665 ns 7821031 ns 90 bytes_per_second=1.49836G/s items_per_second=134.071M/s
BM_ReadStructColumn/50 14142868 ns 14135647 ns 49 bytes_per_second=848.918M/s items_per_second=74.1796M/s
BM_ReadStructColumn/99 3846944 ns 3844119 ns 182 bytes_per_second=3.04849G/s items_per_second=272.774M/s
BM_ReadListColumn/0 7949690 ns 7946976 ns 87 bytes_per_second=1006.67M/s items_per_second=131.947M/s
BM_ReadListColumn/1 9302755 ns 9299473 ns 76 bytes_per_second=860.264M/s items_per_second=112.756M/s
BM_ReadListColumn/50 19390741 ns 19386146 ns 36 bytes_per_second=412.666M/s items_per_second=54.0889M/s
BM_ReadListColumn/99 7162830 ns 7159609 ns 97 bytes_per_second=1117.38M/s items_per_second=146.457M/s
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489556466
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
Ah, OK. I'm not sure if this is representative of real workloads, but it is better than nothing.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
wesm commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693456506
Looks like there is something wrong with the Ursabot result reporting, but you can see the results here:
https://ci.ursalabs.org/#/builders/73/builds/103/steps/3/logs/result
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489591904
##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
}
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
- int32_t last_offset) {
+ int32_t last_offset,
+ double null_probability) {
using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
- GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+ GenOpt options(seed(), first_offset, last_offset, null_probability);
BufferVector buffers{2};
+ int64_t null_count = 0;
+ buffers[0] = *AllocateEmptyBitmap(size);
+ options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+ // Make sure the first and last entry are non-null
Review comment:
Well, at least the last offset should be non-null (according to `ListArray::FromArrays`). The first one needn't, apparently.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489541639
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
Review comment:
Could you comment the -5 and 5 to indicate what these parameters represent?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311487
```
No such command "archery".
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489538445
##########
File path: cpp/src/arrow/testing/random.cc
##########
@@ -250,12 +267,20 @@ std::shared_ptr<Array> RandomArrayGenerator::StringWithRepeats(int64_t size,
}
std::shared_ptr<Array> RandomArrayGenerator::Offsets(int64_t size, int32_t first_offset,
- int32_t last_offset) {
+ int32_t last_offset,
+ double null_probability) {
using GenOpt = GenerateOptions<int32_t, std::uniform_int_distribution<int32_t>>;
- GenOpt options(seed(), first_offset, last_offset, /*null_probability=*/0);
+ GenOpt options(seed(), first_offset, last_offset, null_probability);
BufferVector buffers{2};
+ int64_t null_count = 0;
+ buffers[0] = *AllocateEmptyBitmap(size);
+ options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count);
+ // Make sure the first and last entry are non-null
Review comment:
why is this important?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693574989
I think I addressed your review comments @emkornfield .
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489588294
##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
double null_probability = 0);
- /// \brief Generates a random UInt8Array
+ /// \brief Generate a random UInt8Array
Review comment:
Hmm, I can add this somewhere in the docs. Most of our docstrings use the infinitive, AFAIK.
##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
double null_probability = 0);
- /// \brief Generates a random UInt8Array
+ /// \brief Generate a random UInt8Array
Review comment:
Hmm, I can add this somewhere in the docs. Most of our docstrings use the infinitive, AFAIK.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308768
[AMD64 Ubuntu 18.04 C++ Benchmark (#126385)](https://ci.ursalabs.org/#builders/73/builds/102) builder failed.
Revision: e0b975e83ad6bae138811326e8f82c27d1cb5676
Archery: `'archery benchmark ...'` step's stderr:
```
Using optional gold linker (version 1.15)
Configured for RELEASE build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})
Building Apache Thrift from source
CMake Warning at cmake_modules/ThirdpartyToolchain.cmake:189 (find_package):
No "Findbenchmark.cmake" found in CMAKE_MODULE_PATH.
Call Stack (most recent call first):
cmake_modules/ThirdpartyToolchain.cmake:1756 (resolve_dependency)
CMakeLists.txt:495 (include)
CMake Warning (dev) at cmake_modules/ThirdpartyToolchain.cmake:189 (find_package):
Findbenchmark.cmake must either be part of this project itself, in this
case adjust CMAKE_MODULE_PATH so that it points to the correct location
inside its source tree.
Or it must be installed by a package which has already been found via
find_package(). In this case make sure that package has indeed been found
and adjust CMAKE_MODULE_PATH to contain the location where that package has
installed Findbenchmark.cmake. This must be a location provided by that
package. This error in general means that the buildsystem of this project
is relying on a Find-module without ensuring that it is actually available.
Call Stack (most recent call first):
cmake_modules/ThirdpartyToolchain.cmake:1756 (resolve_dependency)
CMakeLists.txt:495 (include)
This warning is for project developers. Use -Wno-dev to suppress it.
Cloning into '/tmp/arrow-archery-udq6fnfw/run/arrow'...
done.
Checking out files: 95% (4663/4863)
Checking out files: 96% (4669/4863)
Checking out files: 97% (4718/4863)
Checking out files: 98% (4766/4863)
Checking out files: 99% (4815/4863)
Checking out files: 100% (4863/4863)
Checking out files: 100% (4863/4863), done.
fatal: ambiguous argument 'run': unknown revision or path not in the working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'
Traceback (most recent call last):
File "/usr/local/bin/archery", line 11, in <module>
load_entry_point('archery', 'console_scripts', 'archery')()
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 764, in __call__
return self.main(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 717, in main
rv = self.invoke(ctx)
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1137, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 1137, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 956, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/lib/python3.6/dist-packages/click/core.py", line 555, in invoke
return callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/click/decorators.py", line 17, in new_func
return f(get_current_context(), *args, **kwargs)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/cli.py", line 569, in benchmark_diff
benchmark_filter=benchmark_filter)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/benchmark/runner.py", line 83, in from_rev_or_path
src_rev, _ = src.at_revision(rev_or_path, clone_dir)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/source.py", line 153, in at_revision
original_revision = git.rev_parse(revision)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/command.py", line 48, in wrapper
return list_it(strip_it(f(*argv, **kwargs).stdout))
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 29, in wrapper
return fn(self, sub_cmd, *argv, **kwargs)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 78, in rev_parse
return self.run_cmd(*argv, **kwargs)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/git.py", line 43, in run_cmd
return self.run(*opts, cmd, *argv, **kwargs)
File "/buildbot/AMD64_Ubuntu_18_04_C___Benchmark/dev/archery/archery/utils/command.py", line 74, in run
return subprocess.run(invocation, **kwargs)
File "/usr/lib/python3.6/subprocess.py", line 438, in run
output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['git', 'rev-parse', 'run']' returned non-zero exit status 128.
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield closed pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield closed pull request #8203:
URL: https://github.com/apache/arrow/pull/8203
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693308311
```
No such command "archery".
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
pitrou commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693460386
Yeah, that's not what I wanted anyway. I want to run the new benchmarks, not diff existing benchmarks. Never mind...
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-694307078
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489540876
##########
File path: cpp/src/arrow/testing/random.h
##########
@@ -52,7 +60,7 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator {
std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
double null_probability = 0);
- /// \brief Generates a random UInt8Array
+ /// \brief Generate a random UInt8Array
Review comment:
nit: why the change? If we want to standardize on one vs. the other, we should probably update the style guide.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] ursabot removed a comment on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
ursabot removed a comment on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693311487
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] ursabot commented on pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
ursabot commented on pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#issuecomment-693309595
```
Usage: @ursabot benchmark [OPTIONS] [<baseline>]
Run the benchmark suite in comparison mode.
This command will run the benchmark suite for tip of the branch commit
against `<baseline>` (or master if not provided).
Examples:
# Run all the benchmarks
@ursabot benchmark
# Compare only benchmarks where the name matches the /^Sum/ regex
@ursabot benchmark --benchmark-filter=^Sum
# Compare only benchmarks where the suite matches the /compute-/ regex.
# A suite is the C++ binary.
@ursabot benchmark --suite-filter=compute-
# Sometimes a new optimization requires the addition of new benchmarks to
# quantify the performance increase. When doing this be sure to add the
# benchmark in a separate commit before introducing the optimization.
#
# Note that specifying the baseline is the only way to compare using a new
# benchmark, since master does not contain the new benchmark and no
# comparison is possible.
#
# The following command compares the results of matching benchmarks,
# compiling against HEAD and the provided baseline commit, e.g. eaf8302.
# You can use this to quantify the performance improvement of new
# optimizations or to check for regressions.
@ursabot benchmark --benchmark-filter=MyBenchmark eaf8302
Options:
--suite-filter <regex> Regex filtering benchmark suites.
--benchmark-filter <regex> Regex filtering benchmarks.
--cc <compiler> C compiler.
--cxx <compiler> C++ compiler.
--cxx-flags TEXT C++ compiler flags.
--repetitions INTEGER Number of repetitions of each benchmark.
Increasing may improve result precision.
[default: 1]
--help Show this message and exit.
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #8203: ARROW-10024: [C++][Parquet] Create nested reading benchmarks
Posted by GitBox <gi...@apache.org>.
emkornfield commented on a change in pull request #8203:
URL: https://github.com/apache/arrow/pull/8203#discussion_r489542938
##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -280,6 +298,81 @@ BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
->Args({kAlternatingOrNa, 1})
->Args({5, 10});
+//
+// Benchmark reading a nested column
+//
+
+static void BM_ReadStructColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values1 = rng.Int32(kNumValues, -5, 5, null_probability);
+ auto values2 =
+ rng.Int64(kNumValues, -12345678912345LL, 12345678912345LL, null_probability);
+
+ const int64_t kBytesPerValue = sizeof(int32_t) + sizeof(int64_t);
+
+ std::shared_ptr<::arrow::Buffer> null_bitmap;
+ if (nullable) {
+ null_bitmap = rng.NullBitmap(kNumValues, null_probability);
+ }
+ auto array = *::arrow::StructArray::Make(
+ {values1, values2},
+ ::arrow::FieldVector{field("a", values1->type(), nullable),
+ field("b", values2->type(), nullable)},
+ null_bitmap);
+ auto schema = ::arrow::schema({field("s", array->type(), nullable)});
+ auto table = ::arrow::Table::Make(schema, {array}, array->length());
+
+ EXIT_NOT_OK(table->Validate());
+
+ BenchmarkReadTable(state, *table, kNumValues, kBytesPerValue);
+}
+
+BENCHMARK(BM_ReadStructColumn)
+ ->Arg(/*null_percentage=*/0)
+ ->Arg(/*null_percentage=*/1)
+ ->Arg(/*null_percentage=*/50)
+ ->Arg(/*null_percentage=*/99);
+
+static void BM_ReadListColumn(::benchmark::State& state) {
+ constexpr int64_t kNumValues = BENCHMARK_SIZE / 10;
+ const double null_probability = static_cast<double>(state.range(0)) / 100.0;
+ const bool nullable = (null_probability != 0.0);
+
+ ARROW_CHECK_GE(null_probability, 0.0);
+
+ ::arrow::random::RandomArrayGenerator rng(42);
+
+ auto values = rng.Int64(kNumValues, -5, 5, null_probability);
+ auto offsets = rng.Offsets(kNumValues / 10, 0, values->length(), null_probability);
Review comment:
approximately what size lists does this generate?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org