You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/11 21:53:43 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #12852: ARROW-15583: [C++] The Substrait consumer could potentially use a massive amount of RAM if the producer uses large anchors

westonpace commented on code in PR #12852:
URL: https://github.com/apache/arrow/pull/12852#discussion_r870765591


##########
cpp/src/arrow/engine/substrait/extension_set.h:
##########
@@ -226,14 +238,14 @@ class ARROW_ENGINE_EXPORT ExtensionSet {
  private:
   ExtensionIdRegistry* registry_;
   /// The subset of extension registry URIs referenced by this extension set
-  std::vector<util::string_view> uris_;
-  std::vector<TypeRecord> types_;
-
-  std::vector<FunctionRecord> functions_;
-
-  // pimpl pattern to hide lookup details
-  struct Impl;
-  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+  std::unordered_map<uint32_t, util::string_view> uris_;
+  std::unordered_map<uint32_t, TypeRecord> types_;
+  std::unordered_map<uint32_t, FunctionRecord> functions_;
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_, functions_map_;

Review Comment:
   Can we add some comments explaining these fields?  Also, combining declarations on one line is discouraged as it can be a bit tricky to read.
   ```suggestion
     // Map from anchor values to URI values
     std::unordered_map<uint32_t, util::string_view> uris_;
     // Map from anchor values to type definitions, used during Substrait->Arrow
     // and populated from the Substrait extension set
     std::unordered_map<uint32_t, TypeRecord> types_;
     // Map from anchor values to function definitions, used during Substrait->Arrow
     // and populated from the Substrait extension set
     std::unordered_map<uint32_t, FunctionRecord> functions_;
     // Map from type names to anchor values.  Used during Arrow->Substrait
     // and built as the plan is created.
     std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_;
     // Map from function names to anchor values.  Used during Arrow->Substrait
     // and built as the plan is created.
     std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> functions_map_;
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -16,7 +16,6 @@
 // under the License.
 
 #include "arrow/engine/substrait/extension_set.h"
-

Review Comment:
   Can you add this line back in?  We try to keep separation between groups of headers.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   ```suggestion
       return Status::Invalid("Key already exists in the uris map");
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");

Review Comment:
   ```suggestion
       // Substrait plans shouldn't have repeated URIs in the extension set
       return Status::Invalid("Key already exist in the uris map");
   ```
   It's correct, but a bit odd that we do `FindOrCreate` when adding an anchor/uri pair but we do `InsertOrFail` when adding by `Id` so a comment might help.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(
+    std::unordered_map<uint32_t, util::string_view> uris,
+    std::unordered_map<uint32_t, Id> type_ids,
+    std::unordered_map<uint32_t, Id> function_ids, ExtensionIdRegistry* registry) {
   ExtensionSet set;
   set.registry_ = registry;
 
   // TODO(bkietz) move this into the registry as registry->OwnUris(&uris) or so
   std::unordered_set<util::string_view, ::arrow::internal::StringViewHash>
       uris_owned_by_registry;
-  for (util::string_view uri : registry->Uris()) {
+  for (auto uri : registry->Uris()) {

Review Comment:
   ```suggestion
     for (util::string_view uri : registry->Uris()) {
   ```
   I generally don't mind auto but I think it can be confusing to tell when you are dealing with a string or a string_view and having the explicit type might be helpful here.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   ```suggestion
         DCHECK_EQ(types_.find(static_cast<unsigned int>(types_.size())), types_.end()) << "Type existed in types_ but not types_map_.  ExtensionSet is inconsistent";
   ```
   There really isn't much a user can do with this message.  If we get here I think it would have to be a bug (possibly reusing an ExtensionSet) so we might as well make it a `DCHECK`.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeType(const DataType& type) {
   if (auto rec = registry_->GetType(type)) {
-    return impl_->EncodeType(*rec, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        types_map_.emplace(rec->id, static_cast<uint32_t>(types_map_.size()));
+    if (it_success.second) {
+      if (types_.find(static_cast<unsigned int>(types_.size())) != types_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }
+      types_[static_cast<unsigned int>(types_.size())] = {rec->id, rec->type};
+    }
+    return it_success.first->second;
   }
   return Status::KeyError("type ", type.ToString(), " not found in the registry");
 }
 
 Result<ExtensionSet::FunctionRecord> ExtensionSet::DecodeFunction(uint32_t anchor) const {
-  if (anchor >= functions_.size() || functions_[anchor].id.empty()) {
+  if (functions_.find(anchor) == functions_.end() || functions_.at(anchor).id.empty()) {
     return Status::Invalid("User defined function reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return functions_[anchor];
+  return functions_.at(anchor);
 }
 
 Result<uint32_t> ExtensionSet::EncodeFunction(util::string_view function_name) {
   if (auto rec = registry_->GetFunction(function_name)) {
-    return impl_->EncodeFunction(rec->id, rec->function_name, this);
+    RETURN_NOT_OK(this->AddUri(rec->id));
+    auto it_success =
+        functions_map_.emplace(rec->id, static_cast<uint32_t>(functions_map_.size()));
+    if (it_success.second) {
+      if (functions_.find(static_cast<unsigned int>(functions_.size())) !=
+          functions_.end()) {
+        return Status::Invalid("Key already exist in the uris map");
+      }

Review Comment:
   Same as above, let's make this a `DCHECK`.



##########
cpp/src/arrow/engine/substrait/plan_internal.cc:
##########
@@ -43,7 +43,7 @@ Status AddExtensionSetToPlan(const ExtensionSet& ext_set, substrait::Plan* plan)
   auto uris = plan->mutable_extension_uris();
   uris->Reserve(static_cast<int>(ext_set.uris().size()));
   for (uint32_t anchor = 0; anchor < ext_set.uris().size(); ++anchor) {
-    auto uri = ext_set.uris()[anchor];
+    auto uri = ext_set.uris().at(anchor);

Review Comment:
   Hm...does this work?
   ```
   util::string_view uri = ext_set.uris()[anchor];
   ```



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });

Review Comment:
   Hmm...this is a linear search through the URIs map each time.  Admittedly, the URIs map should be quite small, but I wonder if we want this map to be bidirectional as well.  I think we can probably leave it for now though.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -174,31 +134,51 @@ Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
 }
 
 Result<ExtensionSet::TypeRecord> ExtensionSet::DecodeType(uint32_t anchor) const {
-  if (anchor >= types_.size() || types_[anchor].id.empty()) {
+  if (types_.find(anchor) == types_.end() || types_.at(anchor).id.empty()) {
     return Status::Invalid("User defined type reference ", anchor,
                            " did not have a corresponding anchor in the extension set");
   }
-  return types_[anchor];
+  return types_.at(anchor);

Review Comment:
   ```suggestion
     return types_[anchor];
   ```
   We don't handle exceptions so there is generally no benefit to using `at` over `[]`.



##########
cpp/src/arrow/engine/substrait/extension_set.cc:
##########
@@ -40,125 +39,86 @@ struct TypePtrHashEq {
   }
 };
 
-struct IdHashEq {
-  using Id = ExtensionSet::Id;
-
-  size_t operator()(Id id) const {
-    constexpr ::arrow::internal::StringViewHash hash = {};
-    auto out = static_cast<size_t>(hash(id.uri));
-    ::arrow::internal::hash_combine(out, hash(id.name));
-    return out;
-  }
-
-  bool operator()(Id l, Id r) const { return l.uri == r.uri && l.name == r.name; }
-};
-
 }  // namespace
 
 // A builder used when creating a Substrait plan from an Arrow execution plan.  In
 // that situation we do not have a set of anchor values already defined so we keep
 // a map of what Ids we have seen.
-struct ExtensionSet::Impl {
-  void AddUri(util::string_view uri, ExtensionSet* self) {
-    if (uris_.find(uri) != uris_.end()) return;
-
-    self->uris_.push_back(uri);
-    uris_.insert(self->uris_.back());  // lookup helper's keys should reference memory
-                                       // owned by this ExtensionSet
-  }
-
-  Status CheckHasUri(util::string_view uri) {
-    if (uris_.find(uri) != uris_.end()) return Status::OK();
-
-    return Status::Invalid(
-        "Uri ", uri,
-        " was referenced by an extension but was not declared in the ExtensionSet.");
-  }
-
-  uint32_t EncodeType(ExtensionIdRegistry::TypeRecord type_record, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(type_record.id.uri, self);
-    auto it_success =
-        types_.emplace(type_record.id, static_cast<uint32_t>(types_.size()));
-
-    if (it_success.second) {
-      self->types_.push_back(
-          {type_record.id, type_record.type, type_record.is_variation});
-    }
-
-    return it_success.first->second;
-  }
-
-  uint32_t EncodeFunction(Id id, util::string_view function_name, ExtensionSet* self) {
-    // note: at this point we're guaranteed to have an Id which points to memory owned by
-    // the set's registry.
-    AddUri(id.uri, self);
-    auto it_success = functions_.emplace(id, static_cast<uint32_t>(functions_.size()));
+ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry) : registry_(registry) {}
+
+Status ExtensionSet::CheckHasUri(util::string_view uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri;
+                   });
+  if (it != uris_.end()) return Status::OK();
+
+  return Status::Invalid(
+      "Uri ", uri,
+      " was referenced by an extension but was not declared in the ExtensionSet.");
+}
 
-    if (it_success.second) {
-      self->functions_.push_back({id, function_name});
-    }
+void ExtensionSet::AddUri(std::pair<uint32_t, util::string_view> uri) {
+  auto it =
+      std::find_if(uris_.begin(), uris_.end(),
+                   [&uri](const std::pair<uint32_t, util::string_view>& anchor_uri_pair) {
+                     return anchor_uri_pair.second == uri.second;
+                   });
+  if (it != uris_.end()) return;
+  uris_[uri.first] = uri.second;
+}
 
-    return it_success.first->second;
+Status ExtensionSet::AddUri(Id id) {
+  auto uris_size = static_cast<unsigned int>(uris_.size());
+  if (uris_.find(uris_size) != uris_.end()) {
+    return Status::Invalid("Key already exist in the uris map");
   }
+  uris_[uris_size] = id.uri;
+  return Status::OK();
+}
 
-  std::unordered_set<util::string_view, ::arrow::internal::StringViewHash> uris_;
-  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_, functions_;
-};
-
-ExtensionSet::ExtensionSet(ExtensionIdRegistry* registry)
-    : registry_(registry), impl_(new Impl(), [](Impl* impl) { delete impl; }) {}
-
-Result<ExtensionSet> ExtensionSet::Make(std::vector<util::string_view> uris,
-                                        std::vector<Id> type_ids,
-                                        std::vector<bool> type_is_variation,
-                                        std::vector<Id> function_ids,
-                                        ExtensionIdRegistry* registry) {
+Result<ExtensionSet> ExtensionSet::Make(

Review Comment:
   ```suggestion
   // Creates an extension set from the Substrait plan's top-level extensions block
   Result<ExtensionSet> ExtensionSet::Make(
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org