You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "bkietz (via GitHub)" <gi...@apache.org> on 2024/02/28 22:12:35 UTC

[I] [C++] Parse query parameters in util::Uri::Parse [arrow]

bkietz opened a new issue, #40283:
URL: https://github.com/apache/arrow/issues/40283

   ### Describe the enhancement requested
   
   Discussion in: https://github.com/apache/arrow/pull/39067#discussion_r1505853441
   
   It'd be advantageous to provide helpers for looking up specific query parameters. `Uri::query_items()` currently re-parses the query string each time it is called and only returns a sequence of key-value pairs which is less useful for lookup.
   
   <details>
   Basic sketch of initial changes:
   
   <pre>
   diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
   index db134f581..aa846528c 100644
   --- a/cpp/src/arrow/filesystem/s3fs.cc
   +++ b/cpp/src/arrow/filesystem/s3fs.cc
   @@ -355,11 +355,6 @@ Result<S3Options> S3Options::FromUri(const Uri& uri, std::string* out_path) {
        *out_path = std::string(internal::RemoveTrailingSlash(path));
      }
    
   -  std::unordered_map<std::string, std::string> options_map;
   -  ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
   -  for (const auto& kv : options_items) {
   -    options_map.emplace(kv.first, kv.second);
   -  }
    
      const auto username = uri.username();
      if (!username.empty()) {
   @@ -379,7 +374,8 @@ Result<S3Options> S3Options::FromUri(const Uri& uri, std::string* out_path) {
      }
    
      bool region_set = false;
   -  for (const auto& kv : options_map) {
   +  ARROW_ASSIGN_OR_RAISE(auto query_options, uri.query_items());
   +  for (const auto& kv : query_options) {
        if (kv.first == "region") {
          options.region = kv.second;
          region_set = true;
   diff --git a/cpp/src/arrow/util/uri.cc b/cpp/src/arrow/util/uri.cc
   index d1b54e78a..639b3a5b8 100644
   --- a/cpp/src/arrow/util/uri.cc
   +++ b/cpp/src/arrow/util/uri.cc
   @@ -117,25 +117,19 @@ struct Uri::Impl {
      void Reset() {
        uriFreeUriMembersA(&uri_);
        memset(&uri_, 0, sizeof(uri_));
   -    data_.clear();
        string_rep_.clear();
        path_segments_.clear();
        port_ = -1;
      }
    
   -  const std::string& KeepString(const std::string& s) {
   -    data_.push_back(s);
   -    return data_.back();
   -  }
   -
      UriUriA uri_;
   -  // Keep alive strings that uriparser stores pointers to
   -  std::vector<std::string> data_;
   +  // uriparser stores pointers into the string representation, so we must keep it alive
      std::string string_rep_;
      int32_t port_ = -1;
      std::vector<std::string_view> path_segments_;
      bool is_file_uri_;
      bool is_absolute_path_;
   +  std::optional<std::vector<std::pair<std::string, std::string>>> query_items_;
    };
    
    Uri::Uri() : impl_(new Impl) {}
   @@ -214,44 +208,20 @@ std::string Uri::path() const {
    
    std::string Uri::query_string() const { return TextRangeToString(impl_->uri_.query); }
    
   -Result<std::vector<std::pair<std::string, std::string>>> Uri::query_items() const {
   -  // XXX would it be worthwhile to fold this parsing into Uri::parse() or maybe
   -  // cache these lazily in an unordered_map? Then we could provide
   -  // Uri::query_item(std::string name)
   -  const auto& query = impl_->uri_.query;
   -  UriQueryListA* query_list;
   -  int item_count;
   -  std::vector<std::pair<std::string, std::string>> items;
   +Result<util::span<std::pair<std::string, std::string>>> Uri::query_items() const {
   +  if (impl_->query_items_) return *impl_->query_items_;
    
   -  if (query.first == nullptr) {
   -    return items;
   -  }
   -  if (uriDissectQueryMallocA(&query_list, &item_count, query.first, query.afterLast) !=
   -      URI_SUCCESS) {
   -    return Status::Invalid("Cannot parse query string: '", query_string(), "'");
   -  }
   -  std::unique_ptr<UriQueryListA, decltype(&uriFreeQueryListA)> query_guard(
   -      query_list, uriFreeQueryListA);
   -
   -  items.reserve(item_count);
   -  while (query_list != nullptr) {
   -    if (query_list->value != nullptr) {
   -      items.emplace_back(query_list->key, query_list->value);
   -    } else {
   -      items.emplace_back(query_list->key, "");
   -    }
   -    query_list = query_list->next;
   -  }
   -  return items;
   +  return Status::Invalid("Cannot parse query string: '", query_string(), "'");
    }
    
    const std::string& Uri::ToString() const { return impl_->string_rep_; }
    
   -Status Uri::Parse(const std::string& uri_string) {
   +Status Uri::Parse(std::string uri_string) {
      impl_->Reset();
    
   -  const auto& s = impl_->KeepString(uri_string);
   -  impl_->string_rep_ = s;
   +  impl_->string_rep_ = std::move(uri_string);
   +  const auto& s = impl_->string_rep_;
   +
      const char* error_pos;
      if (uriParseSingleUriExA(&impl_->uri_, s.data(), s.data() + s.size(), &error_pos) !=
          URI_SUCCESS) {
   @@ -309,12 +279,39 @@ Status Uri::Parse(const std::string& uri_string) {
        impl_->port_ = port_num;
      }
    
   +  const auto& query = impl_->uri_.query;
   +
   +  if (query.first == nullptr) {
   +    impl_->query_items_ = std::vector<std::pair<std::string, std::string>>{};
   +    return Status::OK();
   +  }
   +
   +  int item_count;
   +  UriQueryListA* query_list;
   +  if (uriDissectQueryMallocA(&query_list, &item_count, query.first, query.afterLast) !=
   +      URI_SUCCESS) {
   +    return Status::Invalid("Cannot parse query string: '", query_string(), "'");
   +  }
   +  std::unique_ptr<UriQueryListA, decltype(&uriFreeQueryListA)> query_guard(
   +      query_list, uriFreeQueryListA);
   +
   +  impl_->query_items_ = std::vector<std::pair<std::string, std::string>>(item_count);
   +  auto* item = impl_->query_items_->data();
   +  while (query_list != nullptr) {
   +    if (query_list->value != nullptr) {
   +      *item++ = {query_list->key, query_list->value};
   +    } else {
   +      *item++ = {query_list->key, ""};
   +    }
   +    query_list = query_list->next;
   +  }
   +
      return Status::OK();
    }
    
   -Result<Uri> Uri::FromString(const std::string& uri_string) {
   +Result<Uri> Uri::FromString(std::string uri_string) {
      Uri uri;
   -  ARROW_RETURN_NOT_OK(uri.Parse(uri_string));
   +  ARROW_RETURN_NOT_OK(uri.Parse(std::move(uri_string)));
      return uri;
    }
    
   diff --git a/cpp/src/arrow/util/uri.h b/cpp/src/arrow/util/uri.h
   index 74dbe924f..cdfcd78f5 100644
   --- a/cpp/src/arrow/util/uri.h
   +++ b/cpp/src/arrow/util/uri.h
   @@ -25,6 +25,7 @@
    #include <vector>
    
    #include "arrow/type_fwd.h"
   +#include "arrow/util/span.h"
    #include "arrow/util/visibility.h"
    
    namespace arrow::util {
   @@ -77,16 +78,16 @@ class ARROW_EXPORT Uri {
      ///
      /// Note this API doesn't allow differentiating between an empty value
      /// and a missing value, such in "a&b=1" vs. "a=&b=1".
   -  Result<std::vector<std::pair<std::string, std::string>>> query_items() const;
   +  Result<util::span<std::pair<std::string, std::string>>> query_items() const;
    
      /// Get the string representation of this URI.
      const std::string& ToString() const;
    
      /// Factory function to parse a URI from its string representation.
   -  Status Parse(const std::string& uri_string);
   +  Status Parse(std::string uri_string);
    
      /// Factory function to parse a URI from its string representation.
   -  static Result<Uri> FromString(const std::string& uri_string);
   +  static Result<Uri> FromString(std::string uri_string);
    
     private:
      struct Impl;
   diff --git a/cpp/src/arrow/util/uri_test.cc b/cpp/src/arrow/util/uri_test.cc
   index 36e09b1b2..9fa436ffe 100644
   --- a/cpp/src/arrow/util/uri_test.cc
   +++ b/cpp/src/arrow/util/uri_test.cc
   @@ -137,7 +137,7 @@ TEST(Uri, ParseQuery) {
        ASSERT_EQ(uri.query_string(), query_string);
        auto result = uri.query_items();
        ASSERT_OK(result);
   -    ASSERT_EQ(*result, items);
   +    ASSERT_EQ(*result, util::span{items});
      };
    
      check_case("unix://localhost/tmp", "", {});
   </pre>
   </details>
   
   ### Component(s)
   
   C++


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@arrow.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org