You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/11/02 17:31:53 UTC

[GitHub] [arrow] benibus commented on a diff in pull request #14552: ARROW-18184: [C++] Improve JSON parser benchmarks

benibus commented on code in PR #14552:
URL: https://github.com/apache/arrow/pull/14552#discussion_r1012095512


##########
cpp/src/arrow/json/test_common.h:
##########
@@ -133,31 +150,72 @@ struct GenerateImpl {
     return Status::NotImplemented("random generation of arrays of type ", t);
   }
 
+  static bool GetBool(Engine& e, double probability) {
+    return std::uniform_real_distribution<>{0, 1}(e) < probability;
+  }
+
+  static bool MaybeWriteNull(Engine& e, Writer* writer, double probability) {
+    bool outcome = GetBool(e, probability);
+    if (outcome) writer->Null();
+    return outcome;
+  }
+
+  static Status WriteValue(const DataType& type, Engine& e, Writer* writer,
+                           const GenerateOptions& options) {
+    GenerateImpl visitor = {e, *writer, options};
+    return VisitTypeInline(type, &visitor);
+  }
+
   Engine& e;
   rj::Writer<rj::StringBuffer>& writer;
+  const GenerateOptions& options;
 };
 
 template <typename Engine>
 inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
-                              Writer* writer) {
-  if (std::uniform_real_distribution<>{0, 1}(e) < .2) {
-    // one out of 5 chance of null, anywhere
-    writer->Null();
+                              Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+  if (Impl::MaybeWriteNull(e, writer, options.null_probability)) {
     return Status::OK();
   }
-  GenerateImpl<Engine> visitor = {e, *writer};
-  return VisitTypeInline(*type, &visitor);
+  return Impl::WriteValue(*type, e, writer, options);
 }
 
 template <typename Engine>
 inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
-                              Engine& e, Writer* writer) {
+                              Engine& e, Writer* writer, const GenerateOptions& options) {
+  using Impl = GenerateImpl<Engine>;
+
   RETURN_NOT_OK(OK(writer->StartObject()));
-  for (const auto& f : fields) {
-    writer->Key(f->name().c_str());
-    RETURN_NOT_OK(Generate(f->type(), e, writer));
+  if (fields.empty()) {
+    return OK(writer->EndObject(0));
+  }
+
+  // Indices of the fields we plan to actually write, in order
+  std::vector<size_t> indices;
+  indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
+
+  for (size_t i = 0; i < fields.size(); ++i) {
+    if (Impl::GetBool(e, options.field_probability)) {
+      indices.push_back(i);
+    }
+  }
+  if (options.randomize_field_order) {
+    std::shuffle(indices.begin(), indices.end(), e);
   }
-  return OK(writer->EndObject(static_cast<int>(fields.size())));
+
+  // Scale the likelyhood of null values to account for any fields we've dropped
+  double null_probability = options.null_probability;
+  null_probability *= static_cast<double>(indices.size()) / fields.size();

Review Comment:
   Yeah... that calculation is egregiously wrong, actually. `null_probability` doesn't need to be adjusted at all. Fortunately, that simplifies everything else.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org