You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/11 04:11:24 UTC

[GitHub] [arrow] sanjibansg opened a new pull request, #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

sanjibansg opened a new pull request, #12852:
URL: https://github.com/apache/arrow/pull/12852

   This PR modifies the ExtensionSet in C++ Consumer to use an `unordered_map` instead of a `vector` to store the `uri` anchors as the lookup table. This also modifies the usage of the `impl` struct as now the included functions are defined directly with the ExtensionSet implementation.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870765591


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +238,14 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
-  std::vector<TypeRecord> types_;
-
-  std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
+  std::unordered_map<uint32_t, TypeRecord> types_;
+  std::unordered_map<uint32_t, FunctionRecord> functions_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Can we add some comments explaining these fields?  Also, combining declarations on one line is discouraged as it can be a bit tricky to read.
   ```suggestion
     // Map from anchor values to URI values
     std::unordered_map<uint32_t, util::string_view> uris_;
     // Map from anchor values to type definitions, used during Substrait->Arrow
     // and populated from the Substrait extension set
     std::unordered_map<uint32_t, TypeRecord> types_;
     // Map from anchor values to function definitions, used during Substrait->Arrow
     // and populated from the Substrait extension set
     std::unordered_map<uint32_t, FunctionRecord> functions_;
     // Map from type names to anchor values.  Used during Arrow->Substrait
     // and built as the plan is created.
     std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_;
     // Map from function names to anchor values.  Used during Arrow->Substrait
     // and built as the plan is created.
     std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> functions_map_;
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -16,7 +16,6 @@
 // under the License.
 
 #include "arrow/engine/substrait/extension_set.h"
-

Review Comment:
   Can you add this line back in?  We try to keep separation between groups of headers.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   ```suggestion
       return Status::Invalid("Key already exists in the uris map");
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   ```suggestion
       // Substrait plans shouldn't have repeated URIs in the extension set
       return Status::Invalid("Key already exist in the uris map");
   ```
   It's correct, but a bit odd that we do `FindOrCreate` when adding an anchor/uri pair but we do `InsertOrFail` when adding by `Id` so a comment might help.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris,
+    std::unordered_map<uint32_t, Id> type_ids,
+    std::unordered_map<uint32_t, Id> function_ids, ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {

Review Comment:
   ```suggestion
     for (util::string_view uri : registry->Uris()) {
   ```
   I generally don't mind auto but I think it can be confusing to tell when you are dealing with a string or a string_view and having the explicit type might be helpful here.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   ```suggestion
         DCHECK_EQ(types_.find(static_cast<unsigned int>(types_.size())), types_.end()) << "Type existed in types_ but not types_map_.  ExtensionSet is inconsistent";
   ```
   There really isn't much a user can do with this message.  If we get here I think it would have to be a bug (possibly reusing an ExtensionSet) so we might as well make it a `DCHECK`.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }
+      types_[static_cast<unsigned int>(types_.size())] = {rec->id, rec->type};
+    }
+    return it_success.first->second;
   }
   return Status::KeyError("type ", type.ToString(), " not found in the registry");
 }
 
 Result<ExtensionSet::FunctionRecord> ExtensionSet::DecodeFunction(uint32_t anchor) const {
-  if (anchor >= functions_.size() || functions_[anchor].id.empty()) {
+  if (functions_.find(anchor) == functions_.end() || functions_.at(anchor).id.empty()) {
     return Status::Invalid("User defined function reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return functions_[anchor];
+  return functions_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeFunction(util::string_view function_name) {
   if (auto rec = registry_->GetFunction(function_name)) {
-    return impl_->EncodeFunction(rec->id, rec->function_name, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        functions_map_.emplace(rec->id, static_cast<uint32_t>(functions_map_.size()));
+    if (it_success.second) {
+      if (functions_.find(static_cast<unsigned int>(functions_.size())) !=
+          functions_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   Same as above, let's make this a `DCHECK`.



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   Hm...does this work?
   ```
   util::string_view uri = ext_set.uris()[anchor];
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });

Review Comment:
   Hmm...this is a linear search through the URIs map each time.  Admittedly, the URIs map should be quite small, but I wonder if we want this map to be bidirectional as well.  I think we can probably leave it for now though.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);

Review Comment:
   ```suggestion
     return types_[anchor];
   ```
   We don't handle exceptions so there is generally no benefit to using `at` over `[]`.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(

Review Comment:
   ```suggestion
   // Creates an extension set from the Substrait plan's top-level extensions block
   Result<ExtensionSet> ExtensionSet::Make(
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jvanstraten commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
jvanstraten commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856573840


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);
   }
 
   // NOTE: it's acceptable to use views to memory owned by plan; ExtensionSet::Make

Review Comment:
   I'm inclined to agree, because personally I still don't understand why Arrow should have producer code at all. The best argument I've heard yet is to communicate plans across language boundaries within Arrow, but it'd be so much easier to just make protobuf messages that actually correspond to the execution engine data structures and type system for that. Heck, throw out protobuf in favor of something that links sanely. The second best is roundtrip testing, but is all the code needed to not only produce but also bit-by-bit *reproduce* the same plan really worth a factor 2 to 3 in code size?
   
   But anyway, what you're describing as a non-goal for those functions is exactly how they are currently used. In practice the anchors will all already exist, since the producer code is AFAIK incomplete and can only reproduce an incoming plan, so the new-anchor-generation code would never trigger under normal circumstances. IMO that's no excuse for that function to just silently override the same anchor over and over again in that case, though. It should at least detect it and throw an error.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864612657


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }

Review Comment:
   Removed it, now reserving space inside `GetExtensionSetFromPlan()`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875293847


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -16,7 +16,6 @@
 // under the License.
 
 #include "arrow/engine/substrait/extension_set.h"
-

Review Comment:
   Made the change, thanks!



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +238,14 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
-  std::vector<TypeRecord> types_;
-
-  std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
+  std::unordered_map<uint32_t, TypeRecord> types_;
+  std::unordered_map<uint32_t, FunctionRecord> functions_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Made the change, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864608540


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   As the `uris` is now an unordered_map, I think it will raise an error if we want to use the `[]` operator on a const unordered_map, and thus used the `.at()` implementation instead. Should I remove the `empty` check, or can there be any better approach?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864614264


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);

Review Comment:
   Refactored that.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864611814


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);

Review Comment:
   Refactored that code, and not currently using the `SetMapElement()` function, is this approach better?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864613939


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }

Review Comment:
   Not using the function anymore, and doing it directly in the `GetExtensionSetFromPlan()`. Is this approach better?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jvanstraten commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
jvanstraten commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r853049018


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -158,14 +178,14 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   /// An extension set should instead be created using
   /// arrow::engine::GetExtensionSetFromPlan
   static Result<ExtensionSet> Make(
-      std::vector<util::string_view> uris, std::vector<Id> type_ids,
+      std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
       std::vector<bool> type_is_variation, std::vector<Id> function_ids,
       ExtensionIdRegistry* = default_extension_id_registry());
 
   // index in these vectors == value of _anchor/_reference fields
   /// TODO(ARROW-15583) this assumes that _anchor/_references won't be huge, which is not
   /// guaranteed. Could it be?

Review Comment:
   This comment is no longer applicable.



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;

Review Comment:
   These should also be converted to `unordered_map`s; in fact, they are more important to convert, as there will be lots more functions and types than URIs in a typical plan.



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }

Review Comment:
   When the code is reimplemented as it should be, this function should become dead code (and be removed), as it is what's causing the potentially large allocations.



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }

Review Comment:
   This is no longer applicable when a map is used. The `reserve()` call is actually highly counterproductive, since a map with the single key 100000 will reserve room for 100001 key-value pairs (this is, in fact, the core problem that needs solving).
   
   Once these lines are removed, I'd argue that the whole function reduces to nothing and should just be inlined.



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);
   }
 
   // NOTE: it's acceptable to use views to memory owned by plan; ExtensionSet::Make

Review Comment:
   GitHub won't let me add comments to code this far away from a change, but these vectors a few lines further down (as well as the equivalent fields of `ExtensionSet`) should also be replaced with maps:
   
   ```c++
     std::vector<Id> type_ids, function_ids;
     std::vector<bool> type_is_variation;
   ```
   
   When you change this and everything affected by it, you'll find that `ExtensionSet::EncodeType` and `ExtensionSet::EncodeFunction` won't work anymore as written. Functionally, they include code to generate a key that wasn't used before, which is easy with a vector, but for a map would require searching for an unused key by iterating over all possible keys until you find one. It might therefore be better to use an ordered `map`, in which case you can use something like `map::rbegin()->first + 1` for the typical case (though you'd still need to implement the empty map and overflow edge cases; in the former case you should return 1, and in the latter case you'd need to exhaustively search for a free key anyway). Don't use anchor 0 (the current implementation probably does); that anchor value is reserved for undefined references in some contexts. If you really want to use an `unordered_map` instead of a `map`, you can also keep track of the latest key that was added to the map and use
  it as a start position for looking for free anchors; that'd still be pretty efficient in the typical case, but requires a bit more bookkeeping.
   
   Furthermore, as I pointed out in my comment to the JIRA, using a single anchor map for both types and type variations, and then using `type_is_variation` to distinguish between types and type variation, was a misinterpretation of the Substrait specification as far as I'm concerned: it's perfectly valid (or at least, unambiguous) for a plan to define a type with anchor 1 as well as a completely unrelated type variation with anchor 1. To be clear, a correct, complete implementation should look something like this:
   
   ```c++
     std::[unordered_]map<uint32_t, Id> type_ids, type_variation_ids, function_ids;
   ```
   
   That's a lot more involved though, as the assumption that a user-defined type or type variation is uniquely identified by a single number is pretty deeply ingrained in the round-tripping code. However, since type variations are not (correctly) implemented anyway, it's probably better for now to just remove the broken and incomplete code related to type variations for now, so it'd just become
   
   ```c++
     std::[unordered_]map<uint32_t, Id> type_ids, function_ids;
   ```
   
   On the consumer end, a nonzero type variation reference [already yields an error](https://github.com/sanjibansg/arrow/blob/b6b445c16f7829a26b91ab447e36d8cddbc32ce3/cpp/src/arrow/engine/substrait/type_internal.cc#L43) (this is why anchor 0 should not be used, by the way). The type variation anchor definitions in the `kExtensionTypeVariation` case in the loop below could just be silently ignored (which is what I would personally do), but it'd be more in line with the rest of the implementation to emit an error when encountering those too, as the implementation aims to error out on anything that wouldn't round-trip perfectly.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875294359


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(

Review Comment:
   Added the suggested comment.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] github-actions[bot] commented on pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on PR #12852:
URL: https://github.com/apache/arrow/pull/12852#issuecomment-1094525619

   https://issues.apache.org/jira/browse/ARROW-15583


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864648104


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;

Review Comment:
   I believe this is because of the conversion of `size_t` to `unsigned int` while using the `[]` operator with `self->uris.size()`. We can static cast it to prevent this error.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jvanstraten commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
jvanstraten commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870817041


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);

Review Comment:
   I haven't looked at the PR again as of yet, but this struck me as an odd take in my e-mail feed. Assuming types_ is now a map of some kind, keep in mind that if the key does not exist, an uninitialized value is silently *added to the map* and then returned by mutable reference, to make x[y] = z work at the expense of sanity in all other use cases. Both [] and at() are bad if there's a real possibility of this occurring (I didn't check), but I'd much rather get an unhandled exception than pants-on-head crazy behavior. For vectors there'd be a minor performance benefit to [] over at(), but that also doesn't apply to maps AFAIK.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r852998113


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);

Review Comment:
   Why the hardcoded limit? If it's important, can it be made constexpr?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);

Review Comment:
   Why not just inline this?



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Does this need to be exposed in the public header? The pImpl at least kept these details out of the header.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;

Review Comment:
   This check isn't in the original, is it needed now? Also, it's missing braces, and shouldn't this be `||` not `&&`



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;

Review Comment:
   This causes a compiler warning on MSVC



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -143,6 +159,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   explicit ExtensionSet(ExtensionIdRegistry* = default_extension_id_registry());
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
 
+  static Status CheckHasUri(util::string_view uri, ExtensionSet* self);
+  static void AddUri(std::pair<uint32_t, util::string_view> uri, ExtensionSet* self);
+  static void AddUri(Id id, ExtensionSet* self);

Review Comment:
   Make these `private`?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }

Review Comment:
   I don't think reserve does much when you're doing it one-by-one. Maybe move it outside of the loop instead.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;
+}
 
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
+    std::vector<bool> type_is_variation, std::vector<Id> function_ids,
+    ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {
     uris_owned_by_registry.insert(uri);
   }
 
   for (auto& uri : uris) {
-    if (uri.empty()) continue;
-    auto it = uris_owned_by_registry.find(uri);
+    auto it = uris_owned_by_registry.find(uri.second);
     if (it == uris_owned_by_registry.end()) {
-      return Status::KeyError("Uri '", uri, "' not found in registry");
+      return Status::KeyError("Uri '", uri.second, "' not found in registry");
     }
-    uri = *it;  // Ensure uris point into the registry's memory
-    set.impl_->AddUri(*it, &set);
+    // uri = *it;  // Ensure uris point into the registry's memory

Review Comment:
   Commented code?



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   `at` will raise an exception if the element doesn't exist - is that what you want? Especially given the `empty` check below



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -143,6 +159,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   explicit ExtensionSet(ExtensionIdRegistry* = default_extension_id_registry());
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
 
+  static Status CheckHasUri(util::string_view uri, ExtensionSet* self);
+  static void AddUri(std::pair<uint32_t, util::string_view> uri, ExtensionSet* self);
+  static void AddUri(Id id, ExtensionSet* self);

Review Comment:
   And if you really are going to remove the pImpl, these shouldn't be static anymore.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870781521


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   Hm...does this work?
   ```
   util::string_view uri = ext_set.uris()[anchor];
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864604482


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;
+  self->uris_[self->uris_.size()] = id.uri;
+}
 
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
+    std::vector<bool> type_is_variation, std::vector<Id> function_ids,
+    ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {
     uris_owned_by_registry.insert(uri);
   }
 
   for (auto& uri : uris) {
-    if (uri.empty()) continue;
-    auto it = uris_owned_by_registry.find(uri);
+    auto it = uris_owned_by_registry.find(uri.second);
     if (it == uris_owned_by_registry.end()) {
-      return Status::KeyError("Uri '", uri, "' not found in registry");
+      return Status::KeyError("Uri '", uri.second, "' not found in registry");
     }
-    uri = *it;  // Ensure uris point into the registry's memory
-    set.impl_->AddUri(*it, &set);
+    // uri = *it;  // Ensure uris point into the registry's memory

Review Comment:
   Sorry, mistake. Corrected that, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] ursabot commented on pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
ursabot commented on PR #12852:
URL: https://github.com/apache/arrow/pull/12852#issuecomment-1137127387

   Benchmark runs are scheduled for baseline = 5994fd88259b8a6ee0efef7233d75352b7360188 and contender = 8bbc6955202650ab1c3f9f564ec99e4e499a1f40. 8bbc6955202650ab1c3f9f564ec99e4e499a1f40 is a master commit associated with this PR. Results will be available as each benchmark for each run completes.
   Conbench compare runs links:
   [Finished :arrow_down:0.0% :arrow_up:0.0%] [ec2-t3-xlarge-us-east-2](https://conbench.ursa.dev/compare/runs/9ca30abbc4984dbb89fa03a3fe317c14...67b7ee824def4ae7b031d96696916d83/)
   [Failed :arrow_down:0.19% :arrow_up:1.87%] [test-mac-arm](https://conbench.ursa.dev/compare/runs/3c567b197a0f4451a9216a6301f3ffed...11945380f42d46adb5c81679377571df/)
   [Failed :arrow_down:0.0% :arrow_up:0.0%] [ursa-i9-9960x](https://conbench.ursa.dev/compare/runs/aa96802ffae5427c951c499a577cd100...086628aa8c12411496d5ab1fc2f26564/)
   [Finished :arrow_down:0.08% :arrow_up:1.89%] [ursa-thinkcentre-m75q](https://conbench.ursa.dev/compare/runs/6841d58727ff4dd49ba92a716c1f8949...336df63889274f729deb59de65876a71/)
   Buildkite builds:
   [Finished] [`8bbc6955` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/818)
   [Failed] [`8bbc6955` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/817)
   [Failed] [`8bbc6955` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/807)
   [Finished] [`8bbc6955` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/821)
   [Finished] [`5994fd88` ec2-t3-xlarge-us-east-2](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ec2-t3-xlarge-us-east-2/builds/817)
   [Failed] [`5994fd88` test-mac-arm](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-test-mac-arm/builds/816)
   [Failed] [`5994fd88` ursa-i9-9960x](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-i9-9960x/builds/806)
   [Finished] [`5994fd88` ursa-thinkcentre-m75q](https://buildkite.com/apache-arrow/arrow-bci-benchmark-on-ursa-thinkcentre-m75q/builds/820)
   Supported benchmarks:
   ec2-t3-xlarge-us-east-2: Supported benchmark langs: Python, R. Runs only benchmarks with cloud = True
   test-mac-arm: Supported benchmark langs: C++, Python, R
   ursa-i9-9960x: Supported benchmark langs: Python, R, JavaScript
   ursa-thinkcentre-m75q: Supported benchmark langs: C++, Java
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870881874


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   Nvm, per @jeroen 's comment below we should keep this as `at`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875294830


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   Using `DCHECK` now instead of only returning a Invalid status



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }
+      types_[static_cast<unsigned int>(types_.size())] = {rec->id, rec->type};
+    }
+    return it_success.first->second;
   }
   return Status::KeyError("type ", type.ToString(), " not found in the registry");
 }
 
 Result<ExtensionSet::FunctionRecord> ExtensionSet::DecodeFunction(uint32_t anchor) const {
-  if (anchor >= functions_.size() || functions_[anchor].id.empty()) {
+  if (functions_.find(anchor) == functions_.end() || functions_.at(anchor).id.empty()) {
     return Status::Invalid("User defined function reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return functions_[anchor];
+  return functions_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeFunction(util::string_view function_name) {
   if (auto rec = registry_->GetFunction(function_name)) {
-    return impl_->EncodeFunction(rec->id, rec->function_name, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        functions_map_.emplace(rec->id, static_cast<uint32_t>(functions_map_.size()));
+    if (it_success.second) {
+      if (functions_.find(static_cast<unsigned int>(functions_.size())) !=
+          functions_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   Made the change, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864644563


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,99 +40,65 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
-
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
-
-    return it_success.first->second;
-  }
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri, ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != self->uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri,
+                          ExtensionSet* self) {
+  auto it =
+      std::find_if(self->uris_.begin(), self->uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != self->uris_.end()) return;
+  self->uris_[uri.first] = uri.second;
+}
 
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
+void ExtensionSet::AddUri(Id id, ExtensionSet* self) {
+  if (self->functions_map_.find(id) != self->functions_map_.end() &&
+      self->types_map_.find(id) != self->types_map_.end())
+    return;

Review Comment:
   I thought it may be needed for doing a check while adding the type_ids or the function_ids. Removed it in the latest commit.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
lidavidm commented on PR #12852:
URL: https://github.com/apache/arrow/pull/12852#issuecomment-1131768203

   > It turns out that `arrow/util/hashing.h` is a [private header](https://github.com/apache/arrow/blob/7f31c9d2d95b5b30fa7997c2519d3b85919f1fd9/cpp/src/arrow/util/hashing.h#L18) (we should probably rename it `hashing_internal.h` but not as part of this PR).
   
   FWIW, see https://issues.apache.org/jira/browse/ARROW-16609 (we may have to make it public?)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace closed pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace closed pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors
URL: https://github.com/apache/arrow/pull/12852


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875294573


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris,
+    std::unordered_map<uint32_t, Id> type_ids,
+    std::unordered_map<uint32_t, Id> function_ids, ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {

Review Comment:
   Using `util::string_view` instead of auto here now



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875294214


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   Added the suggested comment.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r875294091


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   Made the change.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856373437


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Do we have a guideline for hiding.  I find it hard to know when details are "too messy" and deserve a pimpl (other than mutex which we have lint checks for).
   
   Given that users of this header are going to have to dealing with `Id` (`ExtensionIdRegistry` exposes the idea) the `IdHashEq` could be considered useful.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] jvanstraten commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
jvanstraten commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856457349


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);
   }
 
   // NOTE: it's acceptable to use views to memory owned by plan; ExtensionSet::Make

Review Comment:
   > I don't think an `ordered_map` is needed because we can come up with unique ids by just looking at the size of the map (if 0 is not valid then we can add `1`.)
   
   That doesn't work, because there's no requirement for the indices to be sequential for incoming plans. For example, if the indices 1 and 3 are defined, and a new type is added on the production side, it would erroneously try to reuse map_size + 1 = 3 by this logic. The whole reason why we want to use maps at all is because they don't have to be sequential, otherwise it'd be much cheaper to just use vectors. You could use it as a cheap starting point when looking for unused anchors though; if they *are* sequential it would immediately yield a valid index.
   
   > until it's clear what they actually represent.
   
   They're like a poor-man's version of polymorphism, and a way to attach logical type information that doesn't impact the physical representation. For example, one might define a custom type `polygon`, with type variations `triangle` and `quad`, written as `polygon[triangle]` and `polygon[quad]` respectively. They'd all have the same type index and the same physical storage (probably some list of coordinate structs), but a function can then be written to only accept `polygon[triangle]`, and a function that accepts polygons may or may not also accept `polygon[quad]`s and `polygon[triangle]`s (this depends on how the variation is defined in the YAML files). Note that variations can also be defined for Substrait's built-in types. Ultimately, a Substrait type consists of its physical base type (a simple type, compound type, or user-defined type), its nullability, a variation index (using 0 for "no variation"), and (for compound types and hopefully at some point also user-defined types) 
 a set of template parameters.
   
   Unless we really want to round-trip Substrait plans or validate them completely (having spent months on just a validator that still can't do everything, I don't recommend it), we could just ignore variations entirely. Even if someone else defined a variation in an extension we don't know about, it would have no bearing on how the plan is executed, since we only need to know what the physical representation of a type is. It's more of a producer or optimizer thing, to restrict which functions are available.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] lidavidm commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
lidavidm commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856376046


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   I don't think we have a real guideline. IdHashEq could indeed be useful, but unordered_map is a "heavy" include to have (that said we lost that battle a long time ago). I don't think we have to hide this, though, just wondering if it's useful.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864605218


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -158,14 +178,14 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   /// An extension set should instead be created using
   /// arrow::engine::GetExtensionSetFromPlan
   static Result<ExtensionSet> Make(
-      std::vector<util::string_view> uris, std::vector<Id> type_ids,
+      std::unordered_map<uint32_t, util::string_view> uris, std::vector<Id> type_ids,
       std::vector<bool> type_is_variation, std::vector<Id> function_ids,
       ExtensionIdRegistry* = default_extension_id_registry());
 
   // index in these vectors == value of _anchor/_reference fields
   /// TODO(ARROW-15583) this assumes that _anchor/_references won't be huge, which is not
   /// guaranteed. Could it be?

Review Comment:
   Made the change, thanks!



##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +246,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
   std::vector<TypeRecord> types_;
-
   std::vector<FunctionRecord> functions_;

Review Comment:
   Made the change.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864604706


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -143,6 +159,10 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
   explicit ExtensionSet(ExtensionIdRegistry* = default_extension_id_registry());
   ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
 
+  static Status CheckHasUri(util::string_view uri, ExtensionSet* self);
+  static void AddUri(std::pair<uint32_t, util::string_view> uri, ExtensionSet* self);
+  static void AddUri(Id id, ExtensionSet* self);

Review Comment:
   Made the change, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] sanjibansg commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
sanjibansg commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r864610550


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }

Review Comment:
   Removed `SetElement` function, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870881168


##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);

Review Comment:
   Ah, fair enough.  I think you're probably right on the behavior for maps.  Thanks for the catch.  Let's keep this as `at`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856547712


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);
   }
 
   // NOTE: it's acceptable to use views to memory owned by plan; ExtensionSet::Make

Review Comment:
   > That doesn't work, because there's no requirement for the indices to be sequential for incoming plans
   
   If it's an incoming plan we don't need to invent anchors.  This "come up with a new anchor to use" problem only occurs when going from Arrow -> Substrait.  I don't think it makes sense to share an ExtensionSet between production and consumption scenarios.
   
   Thanks for the explanation of variations.  What you are describing makes sense.  It would allow functions to restrict themselves to a certain "class" of a type.  I'd be curious to see a real world use case for such a thing but otherwise I agree that consumers can safely ignore them (provided they aren't expected to be validating consumers)
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

Posted by GitBox <gi...@apache.org>.
westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r856375614


##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -108,13 +108,23 @@ void SetElement(size_t i, const Element& element, std::vector<T>* vector) {
   }
   (*vector)[i] = static_cast<T>(element);
 }
+
+template <typename Element, typename key, typename value>
+void SetMapElement(key i, const Element& element, std::unordered_map<key, value>* map) {
+  DCHECK_LE(i, 1 << 20);
+  if (i >= map->size()) {
+    map->reserve(i + 1);
+  }
+  (*map)[i] = static_cast<value>(element);
+}
+
 }  // namespace
 
 Result<ExtensionSet> GetExtensionSetFromPlan(const substrait::Plan& plan,
                                              ExtensionIdRegistry* registry) {
-  std::vector<util::string_view> uris;
+  std::unordered_map<uint32_t, util::string_view> uris;
   for (const auto& uri : plan.extension_uris()) {
-    SetElement(uri.extension_uri_anchor(), uri.uri(), &uris);
+    SetMapElement(uri.extension_uri_anchor(), uri.uri(), &uris);
   }
 
   // NOTE: it's acceptable to use views to memory owned by plan; ExtensionSet::Make

Review Comment:
   I don't think an `ordered_map` is needed because we can come up with unique ids by just looking at the size of the map (if 0 is not valid then we can add `1`.)
   
   I agree that the vectors won't work and I also agree that removing type variations entirely is maybe the best idea until it's clear what they actually represent.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org