You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/04 09:06:24 UTC

[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864604482


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;
+}
 
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
+    std::vector<bool> type_is_variation, std::vector<Id> function_ids,
+    ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {
     uris_owned_by_registry.insert(uri);
   }
 
   for (auto& uri : uris) {
-    if (uri.empty()) continue;
-    auto it = uris_owned_by_registry.find(uri);
+    auto it = uris_owned_by_registry.find(uri.second);
     if (it == uris_owned_by_registry.end()) {
-      return Status::KeyError("Uri '", uri, "' not found in registry");
+      return Status::KeyError("Uri '", uri.second, "' not found in registry");
     }
-    uri = *it;  // Ensure uris point into the registry's memory
-    set.impl_->AddUri(*it, &set);
+    // uri = *it;  // Ensure uris point into the registry's memory

Review Comment:
   Sorry, mistake. Corrected that, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org