You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/02 13:32:00 UTC
[GitHub] [arrow] pitrou commented on a diff in pull request #14552: ARROW-18184: [C++] Improve JSON parser benchmarks

pitrou commented on code in PR #14552:
URL: https://github.com/apache/arrow/pull/14552#discussion_r1011771162


##########
cpp/src/arrow/json/test_common.h:
##########
@@ -133,31 +150,72 @@ struct GenerateImpl {
     return Status::NotImplemented("random generation of arrays of type ", t);
   }
 
+  static bool GetBool(Engine& e, double probability) {
+    return std::uniform_real_distribution<>{0, 1}(e) < probability;
+  }
+
+  static bool MaybeWriteNull(Engine& e, Writer* writer, double probability) {
+    bool outcome = GetBool(e, probability);
+    if (outcome) writer->Null();
+    return outcome;
+  }
+
+  static Status WriteValue(const DataType& type, Engine& e, Writer* writer,
+                           const GenerateOptions& options) {
+    GenerateImpl visitor = {e, *writer, options};
+    return VisitTypeInline(type, &visitor);
+  }
+
   Engine& e;
   rj::Writer<rj::StringBuffer>& writer;
+  const GenerateOptions& options;
 };
 
 template <typename Engine>
 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
-                              Writer* writer) {
-  if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
-    // one out of 5 chance of null, anywhere
-    writer->Null();
+                              Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+  if (Impl::MaybeWriteNull(e, writer, options.null_probability)) {
     return Status::OK();
   }
-  GenerateImpl<Engine> visitor = {e, *writer};
-  return VisitTypeInline(*type, &visitor);
+  return Impl::WriteValue(*type, e, writer, options);
 }
 
 template <typename Engine>
 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
-                              Engine& e, Writer* writer) {
+                              Engine& e, Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+
   RETURN_NOT_OK(OK(writer->StartObject()));
-  for (const auto& f : fields) {
-    writer->Key(f->name().c_str());
-    RETURN_NOT_OK(Generate(f->type(), e, writer));
+  if (fields.empty()) {
+    return OK(writer->EndObject(0));
+  }
+
+  // Indices of the fields we plan to actually write, in order
+  std::vector<size_t> indices;
+  indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
+
+  for (size_t i = 0; i < fields.size(); ++i) {
+    if (Impl::GetBool(e, options.field_probability)) {
+      indices.push_back(i);
+    }
+  }
+  if (options.randomize_field_order) {
+    std::shuffle(indices.begin(), indices.end(), e);
   }
-  return OK(writer->EndObject(static_cast<int>(fields.size())));
+
+  // Scale the likelyhood of null values to account for any fields we've dropped
+  double null_probability = options.null_probability;
+  null_probability *= static_cast<double>(indices.size()) / fields.size();

Review Comment:
   This calculation is a bit weird. For example, if you have `options.null_probability = 0.2` and `option.field_probability = 0.1`, then you'll end using `null_probability = 0.02`, but in the end the proportion of non-null fields will be around 0.08 (far from 0.2).
   
   That said, it may not matter much that the calculation is incorrect, since the important parameter for this benchmark is bound to be the presence/absence fields rather than their null-ness.



##########
cpp/src/arrow/json/test_common.h:
##########
@@ -133,31 +150,72 @@ struct GenerateImpl {
     return Status::NotImplemented("random generation of arrays of type ", t);
   }
 
+  static bool GetBool(Engine& e, double probability) {
+    return std::uniform_real_distribution<>{0, 1}(e) < probability;
+  }
+
+  static bool MaybeWriteNull(Engine& e, Writer* writer, double probability) {
+    bool outcome = GetBool(e, probability);
+    if (outcome) writer->Null();
+    return outcome;
+  }
+
+  static Status WriteValue(const DataType& type, Engine& e, Writer* writer,
+                           const GenerateOptions& options) {
+    GenerateImpl visitor = {e, *writer, options};
+    return VisitTypeInline(type, &visitor);
+  }
+
   Engine& e;
   rj::Writer<rj::StringBuffer>& writer;
+  const GenerateOptions& options;
 };
 
 template <typename Engine>
 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
-                              Writer* writer) {
-  if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
-    // one out of 5 chance of null, anywhere
-    writer->Null();
+                              Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+  if (Impl::MaybeWriteNull(e, writer, options.null_probability)) {
     return Status::OK();
   }
-  GenerateImpl<Engine> visitor = {e, *writer};
-  return VisitTypeInline(*type, &visitor);
+  return Impl::WriteValue(*type, e, writer, options);
 }
 
 template <typename Engine>
 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
-                              Engine& e, Writer* writer) {
+                              Engine& e, Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+
   RETURN_NOT_OK(OK(writer->StartObject()));
-  for (const auto& f : fields) {
-    writer->Key(f->name().c_str());
-    RETURN_NOT_OK(Generate(f->type(), e, writer));
+  if (fields.empty()) {
+    return OK(writer->EndObject(0));
+  }
+
+  // Indices of the fields we plan to actually write, in order
+  std::vector<size_t> indices;
+  indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
+
+  for (size_t i = 0; i < fields.size(); ++i) {
+    if (Impl::GetBool(e, options.field_probability)) {

Review Comment:
   This is paying the price of instantiating the distribution for each field, how about folding it here?
   (`GetBool` is trivial enough to perhaps not warrant a dedicated helper)
   
   Also, the idiomatic distribution would probably be `std::bernoulli_distribution`.



##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -153,12 +187,44 @@ static void ReadJSONBlockWithSchemaMultiThread(
   BenchmarkReadJSONBlockWithSchema(state, true);
 }
 
+static void ParseFields(benchmark::State& state) {  // NOLINT non-const reference
+  const bool ordered = !!state.range(0);
+  const bool with_schema = !!state.range(1);
+  const double sparsity = state.range(2) / 10.0;
+  const auto num_fields = static_cast<int>(state.range(3));
+
+  const int32_t num_rows = 1000;
+
+  auto fields = GenerateTestFields(num_fields, 10);
+
+  auto parse_options = ParseOptions::Defaults();
+  if (with_schema) {
+    parse_options.explicit_schema = schema(fields);
+    parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
+  }
+
+  auto gen_options = GenerateOptions::Defaults();
+  gen_options.field_probability = 1.0 - sparsity;
+  gen_options.randomize_field_order = !ordered;
+
+  auto json = GenerateTestData(fields, num_rows, gen_options);
+  BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), parse_options);
+}
+
 BENCHMARK(ChunkJSONPrettyPrinted);
 BENCHMARK(ChunkJSONLineDelimited);
 BENCHMARK(ParseJSONBlockWithSchema);
 
 BENCHMARK(ReadJSONBlockWithSchemaSingleThread);
 BENCHMARK(ReadJSONBlockWithSchemaMultiThread)->UseRealTime();
 
+BENCHMARK(ParseFields)

Review Comment:
   Let's keep using a consistent nomenclature
   ```suggestion
   BENCHMARK(ParseJSONFields)
   ```



##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -69,9 +102,9 @@ static void ChunkJSONPrettyPrinted(
 
   auto options = ParseOptions::Defaults();
   options.newlines_in_values = true;
-  options.explicit_schema = TestSchema();
+  options.explicit_schema = schema(TestFields());
 
-  auto json = TestJsonData(num_rows, /* pretty */ true);
+  auto json = GenerateTestData(options.explicit_schema, num_rows, true);

Review Comment:
   ```suggestion
     auto json = GenerateTestData(options.explicit_schema, num_rows, /*pretty=*/ true);
   ```



##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -153,12 +187,44 @@ static void ReadJSONBlockWithSchemaMultiThread(
   BenchmarkReadJSONBlockWithSchema(state, true);
 }
 
+static void ParseFields(benchmark::State& state) {  // NOLINT non-const reference
+  const bool ordered = !!state.range(0);
+  const bool with_schema = !!state.range(1);
+  const double sparsity = state.range(2) / 10.0;
+  const auto num_fields = static_cast<int>(state.range(3));
+
+  const int32_t num_rows = 1000;
+
+  auto fields = GenerateTestFields(num_fields, 10);
+
+  auto parse_options = ParseOptions::Defaults();
+  if (with_schema) {
+    parse_options.explicit_schema = schema(fields);
+    parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
+  }
+
+  auto gen_options = GenerateOptions::Defaults();
+  gen_options.field_probability = 1.0 - sparsity;
+  gen_options.randomize_field_order = !ordered;
+
+  auto json = GenerateTestData(fields, num_rows, gen_options);
+  BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), parse_options);
+}
+
 BENCHMARK(ChunkJSONPrettyPrinted);
 BENCHMARK(ChunkJSONLineDelimited);
 BENCHMARK(ParseJSONBlockWithSchema);
 
 BENCHMARK(ReadJSONBlockWithSchemaSingleThread);
 BENCHMARK(ReadJSONBlockWithSchemaMultiThread)->UseRealTime();
 
+BENCHMARK(ParseFields)
+    // NOTE: "sparsity" is the proportion of missing fields from 0-10
+    ->ArgNames({"ordered", "schema", "sparsity", "n"})

Review Comment:
   Perhaps a little more explicit:
   ```suggestion
       ->ArgNames({"ordered", "schema", "sparsity", "num_fields"})
   ```



##########
cpp/src/arrow/json/parser_benchmark.cc:
##########
@@ -153,12 +187,44 @@ static void ReadJSONBlockWithSchemaMultiThread(
   BenchmarkReadJSONBlockWithSchema(state, true);
 }
 
+static void ParseFields(benchmark::State& state) {  // NOLINT non-const reference
+  const bool ordered = !!state.range(0);
+  const bool with_schema = !!state.range(1);
+  const double sparsity = state.range(2) / 10.0;
+  const auto num_fields = static_cast<int>(state.range(3));
+
+  const int32_t num_rows = 1000;
+
+  auto fields = GenerateTestFields(num_fields, 10);
+
+  auto parse_options = ParseOptions::Defaults();
+  if (with_schema) {
+    parse_options.explicit_schema = schema(fields);
+    parse_options.unexpected_field_behavior = UnexpectedFieldBehavior::Error;
+  }
+
+  auto gen_options = GenerateOptions::Defaults();
+  gen_options.field_probability = 1.0 - sparsity;
+  gen_options.randomize_field_order = !ordered;
+
+  auto json = GenerateTestData(fields, num_rows, gen_options);
+  BenchmarkJSONParsing(state, std::make_shared<Buffer>(json), parse_options);
+}
+
 BENCHMARK(ChunkJSONPrettyPrinted);
 BENCHMARK(ChunkJSONLineDelimited);
 BENCHMARK(ParseJSONBlockWithSchema);
 
 BENCHMARK(ReadJSONBlockWithSchemaSingleThread);
 BENCHMARK(ReadJSONBlockWithSchemaMultiThread)->UseRealTime();
 
+BENCHMARK(ParseFields)
+    // NOTE: "sparsity" is the proportion of missing fields from 0-10
+    ->ArgNames({"ordered", "schema", "sparsity", "n"})
+    // Ordered/unordered fields with/without an explicit schema
+    ->ArgsProduct({{1, 0}, {1, 0}, {0}, {10, 100, 1000}})
+    // Non-contiguous ordered fields with/without an explicit schema
+    ->ArgsProduct({{1}, {1, 0}, {1, 9}, {10, 100, 1000}});

Review Comment:
   Why not also test unordered field data here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org