You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/06/01 17:56:22 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #10264: ARROW-12644: [C++][Python][R][Dataset] URL-decode path segments in partitioning

pitrou commented on a change in pull request #10264:
URL: https://github.com/apache/arrow/pull/10264#discussion_r643360494



##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -174,9 +192,9 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
  public:
   /// If a field in schema is of dictionary type, the corresponding element of
   /// dictionaries must be contain the dictionary of values for that field.
-  explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
-                                 ArrayVector dictionaries = {})
-      : KeyValuePartitioning(std::move(schema), std::move(dictionaries)) {}
+  explicit DirectoryPartitioning(
+      std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
+      KeyValuePartitioningOptions options = KeyValuePartitioningOptions());

Review comment:
       Just FYI, I think `KeyValuePartitioningOptions options = {}` can be used as a shorter spelling.

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -259,15 +274,29 @@ Result<std::string> KeyValuePartitioning::Format(const compute::Expression& expr
   return FormatValues(values);
 }
 
-std::vector<KeyValuePartitioning::Key> DirectoryPartitioning::ParseKeys(
+DirectoryPartitioning::DirectoryPartitioning(std::shared_ptr<Schema> schema,
+                                             ArrayVector dictionaries,
+                                             KeyValuePartitioningOptions options)
+    : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options) {
+  if (options.url_decode_segments) {
+    util::InitializeUTF8();
+  }
+}
+
+Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
     const std::string& path) const {
   std::vector<Key> keys;
 
   int i = 0;
   for (auto&& segment : fs::internal::SplitAbstractPath(path)) {
     if (i >= schema_->num_fields()) break;
 
-    keys.push_back({schema_->field(i++)->name(), std::move(segment)});
+    if (options_.url_decode_segments) {
+      ARROW_ASSIGN_OR_RAISE(auto decoded, SafeUriUnescape(segment));
+      keys.push_back({schema_->field(i++)->name(), std::move(decoded)});
+    } else {
+      keys.push_back({schema_->field(i++)->name(), std::move(segment)});

Review comment:
       I'm curious: is there any reason we don't utf8-validate here?

##########
File path: cpp/src/arrow/dataset/partition.cc
##########
@@ -481,28 +533,39 @@ std::shared_ptr<PartitioningFactory> DirectoryPartitioning::MakeFactory(
       new DirectoryPartitioningFactory(std::move(field_names), options));
 }
 
-util::optional<KeyValuePartitioning::Key> HivePartitioning::ParseKey(
-    const std::string& segment, const std::string& null_fallback) {
+Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
+    const std::string& segment, const HivePartitioningOptions& options) {
   auto name_end = string_view(segment).find_first_of('=');
   // Not round-trippable
   if (name_end == string_view::npos) {
     return util::nullopt;
   }
 
   auto name = segment.substr(0, name_end);
-  auto value = segment.substr(name_end + 1);
-  if (value == null_fallback) {
-    return Key{name, util::nullopt};
+  std::string value;
+  if (options.url_decode_segments) {
+    // Static method, so we have no better place for it
+    util::InitializeUTF8();
+    auto raw_value =
+        util::string_view(segment.data() + name_end + 1, segment.size() - name_end - 1);

Review comment:
       Is there a reason you don't use the same spelling as below, i.e. `segment.substr(name_end + 1)`?

##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -1947,13 +1949,17 @@ cdef class DirectoryPartitioning(Partitioning):
     cdef:
         CDirectoryPartitioning* directory_partitioning
 
-    def __init__(self, Schema schema not None, dictionaries=None):
+    def __init__(self, Schema schema not None, dictionaries=None,
+                 bint url_decode_segments=True):

Review comment:
       Since this is a boolean option, I'd rather make it keyword-only, i.e. `def __init__(..., *, bint url_decode_segments=True)`.

##########
File path: cpp/src/arrow/dataset/partition.h
##########
@@ -89,8 +89,15 @@ class ARROW_DS_EXPORT Partitioning {
   std::shared_ptr<Schema> schema_;
 };
 
+/// \brief Options for key-value based partitioning (hive/directory).
+struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
+  /// After splitting a path into components, URL-decode the path components
+  /// before parsing.
+  bool url_decode_segments = true;

Review comment:
       I'm afraid at some point, we'll have to deal with other encodings (e.g. hex, or some system-specific oddity). Perhaps make this an enum instead, so that it's open-ended?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org