You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/19 12:36:31 UTC

[GitHub] [arrow] lidavidm commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

lidavidm commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r852998113


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);

Review Comment:
   Why the hardcoded limit? If it's important, can it be made constexpr?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);

Review Comment:
   Why not just inline this?



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Does this need to be exposed in the public header? The pImpl at least kept these details out of the header.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;

Review Comment:
   This check isn't in the original, is it needed now? Also, it's missing braces, and shouldn't this be `||` not `&&`



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;

Review Comment:
   This causes a compiler warning on MSVC



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -143,6 +159,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   explicit ExtensionSet(ExtensionIdRegistry* = default_extension_id_registry());
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
 
+  static Status CheckHasUri(util::string_view uri, ExtensionSet* self);
+  static void AddUri(std::pair<uint32_t, util::string_view> uri, ExtensionSet* self);
+  static void AddUri(Id id, ExtensionSet* self);

Review Comment:
   Make these `private`?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }

Review Comment:
   I don't think reserve does much when you're doing it one-by-one. Maybe move it outside of the loop instead.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;
+}
 
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
+    std::vector<bool> type_is_variation, std::vector<Id> function_ids,
+    ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {
     uris_owned_by_registry.insert(uri);
   }
 
   for (auto& uri : uris) {
-    if (uri.empty()) continue;
-    auto it = uris_owned_by_registry.find(uri);
+    auto it = uris_owned_by_registry.find(uri.second);
     if (it == uris_owned_by_registry.end()) {
-      return Status::KeyError("Uri '", uri, "' not found in registry");
+      return Status::KeyError("Uri '", uri.second, "' not found in registry");
     }
-    uri = *it;  // Ensure uris point into the registry's memory
-    set.impl_->AddUri(*it, &set);
+    // uri = *it;  // Ensure uris point into the registry's memory

Review Comment:
   Commented code?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   `at` will raise an exception if the element doesn't exist - is that what you want? Especially given the `empty` check below



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -143,6 +159,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   explicit ExtensionSet(ExtensionIdRegistry* = default_extension_id_registry());
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
 
+  static Status CheckHasUri(util::string_view uri, ExtensionSet* self);
+  static void AddUri(std::pair<uint32_t, util::string_view> uri, ExtensionSet* self);
+  static void AddUri(Id id, ExtensionSet* self);

Review Comment:
   And if you really are going to remove the pImpl, these shouldn't be static anymore.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org