You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "westonpace (via GitHub)" <gi...@apache.org> on 2023/04/11 22:22:17 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #34984: GH-34565: [C++] Teach dataset_writer to accept custom filename functor

westonpace commented on code in PR #34984:
URL: https://github.com/apache/arrow/pull/34984#discussion_r1163385209


##########
cpp/src/arrow/dataset/dataset_writer.cc:
##########
@@ -308,11 +309,24 @@ class DatasetWriterDirectoryQueue {
   }
 
   Result<std::string> GetNextFilename() {
-    auto basename = ::arrow::internal::Replace(write_options_.basename_template,
-                                               kIntegerToken, ToChars(file_counter_++));
+    std::optional<std::string> basename;
+    if (write_options_.basename_template_functor == nullptr) {
+      basename = ::arrow::internal::Replace(write_options_.basename_template,
+                                            kIntegerToken, ToChars(file_counter_++));
+    } else {
+      basename = ::arrow::internal::Replace(

Review Comment:
   If a user is providing a `basename_template_functor` do we need to use `basename_template` at all?  Can this just be...
   
   ```
   basename = write_options_.basename_template_functor(file_counter_++);
   ```
   
   I don't think it matters too much but I think it would be simpler to say "either you use the template OR you use a custom function".



##########
cpp/src/arrow/dataset/dataset_writer.cc:
##########
@@ -308,11 +309,24 @@ class DatasetWriterDirectoryQueue {
   }
 
   Result<std::string> GetNextFilename() {
-    auto basename = ::arrow::internal::Replace(write_options_.basename_template,
-                                               kIntegerToken, ToChars(file_counter_++));
+    std::optional<std::string> basename;
+    if (write_options_.basename_template_functor == nullptr) {
+      basename = ::arrow::internal::Replace(write_options_.basename_template,
+                                            kIntegerToken, ToChars(file_counter_++));
+    } else {
+      basename = ::arrow::internal::Replace(
+          write_options_.basename_template, kIntegerToken,
+          write_options_.basename_template_functor(file_counter_++));
+    }
     if (!basename) {
       return Status::Invalid("string interpolation of basename template failed");
     }
+    if (used_filenames_.find(*basename) != used_filenames_.end()) {
+      return Status::Invalid("filename ", *basename,
+                             " is already used before. Check basename_template_functor");
+    } else {
+      used_filenames_.insert(*basename);
+    }

Review Comment:
   Small nit, not a big deal, but you could combine the `find` and `insert` calls...
   
   ```
   if (!used_filenames_.insert(*basename).second) {
     return Status::Invalid(...);
   }
   ```



##########
cpp/src/arrow/dataset/file_base.h:
##########
@@ -404,6 +404,10 @@ struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
   /// {i} will be replaced by an auto incremented integer.
   std::string basename_template;
 
+  /// A functor which will be applied on basename_template to replace auto incremented
+  /// integer only if provided.

Review Comment:
   ```suggestion
     /// A functor which will be applied on an incremented counter.  The result will be inserted into
     /// the basename_template in place of {i}.
     ///
     /// This can be used, for example, to left-pad the file counter.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org