You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/28 09:36:12 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12625: ARROW-15587: [C++] Add support for all options specified by substrait::ReadRel::LocalFiles::FileOrFiles

pitrou commented on code in PR #12625:
URL: https://github.com/apache/arrow/pull/12625#discussion_r860678354


##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        std::string& path) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+  size_t i = 0;
+
+#if _WIN32
+  auto split_path = fs::internal::SplitAbstractPath(path, '\\');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(split_path[i++] + "\\"));

Review Comment:
   Instead of switching based on platform here, file paths should be normalized to `/` separators at an upper level.
   Filesystems are not supposed to support `\` separators.



##########
cpp/src/arrow/engine/substrait/serde_test.cc:
##########
@@ -654,7 +654,9 @@ TEST(Substrait, ReadRel) {
   ASSERT_EQ(scan_node_options.dataset->type_name(), "filesystem");
   const auto& dataset =
       checked_cast<const dataset::FileSystemDataset&>(*scan_node_options.dataset);
-  EXPECT_THAT(dataset.files(), ElementsAre("/tmp/dat1.parquet", "/tmp/dat2.parquet"));
+  auto files = dataset.files();
+  std::sort(files.begin(), files.end());
+  EXPECT_THAT(files, ElementsAre("/tmp/dat1.parquet", "/tmp/dat2.parquet"));

Review Comment:
   Can use `UnorderedElementsAre` so that you don't have to sort manually.



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        std::string& path) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+  size_t i = 0;

Review Comment:
   Can we keep the convention of using either `int32_t` or `int64_t`?



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        std::string& path) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+  size_t i = 0;
+
+#if _WIN32
+  auto split_path = fs::internal::SplitAbstractPath(path, '\\');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo(split_path[i++] + "\\"));

Review Comment:
   Also, why are you adding a trailing separator?



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,59 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        std::string& path) {

Review Comment:
   Also, should be `const std::string&`



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -88,35 +142,41 @@ Result<compute::Declaration> FromProto(const substrait::Rel& rel,
 
       std::shared_ptr<dataset::FileFormat> format;
       auto filesystem = std::make_shared<fs::LocalFileSystem>();
-      std::vector<std::shared_ptr<dataset::FileFragment>> fragments;
+      std::vector<fs::FileInfo> files;
 
       for (const auto& item : read.local_files().items()) {
-        if (item.path_type_case() !=
-            substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
-          return Status::NotImplemented(
-              "substrait::ReadRel::LocalFiles::FileOrFiles with "
-              "path_type other than uri_file");
+        std::string path;
+        if (item.path_type_case() ==
+            substrait::ReadRel_LocalFiles_FileOrFiles::kUriPath) {
+          path = item.uri_path();
+        } else if (item.path_type_case() ==
+                   substrait::ReadRel_LocalFiles_FileOrFiles::kUriFile) {
+          path = item.uri_file();
+        } else if (item.path_type_case() ==
+                   substrait::ReadRel_LocalFiles_FileOrFiles::kUriFolder) {
+          path = item.uri_folder();
+        } else {
+          path = item.uri_path_glob();
         }
 
         if (item.format() ==
             substrait::ReadRel::LocalFiles::FileOrFiles::FILE_FORMAT_PARQUET) {
           format = std::make_shared<dataset::ParquetFileFormat>();
-        } else if (util::string_view{item.uri_file()}.ends_with(".arrow")) {
+        } else if (util::string_view{path}.ends_with(".arrow")) {
           format = std::make_shared<dataset::IpcFileFormat>();
-        } else if (util::string_view{item.uri_file()}.ends_with(".feather")) {
+        } else if (util::string_view{path}.ends_with(".feather")) {
           format = std::make_shared<dataset::IpcFileFormat>();
         } else {
           return Status::NotImplemented(
               "substrait::ReadRel::LocalFiles::FileOrFiles::format "
               "other than FILE_FORMAT_PARQUET");
         }
 
-        if (!util::string_view{item.uri_file()}.starts_with("file:///")) {
+        if (!util::string_view{path}.starts_with("file:///")) {

Review Comment:
   Wow. URIs should really be parsed with the `Uri` class from `arrow/util/uri.h`. Can you at least open a JIRA for that?



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,38 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(std::string p) : pattern_(std::regex(std::move(transform(p)))) {}

Review Comment:
   Nit: `std::move` is not required here.
   ```suggestion
     explicit Impl(const std::string& p) : pattern_(std::regex(transform(p))) {}
   ```



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,38 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(std::string p) : pattern_(std::regex(std::move(transform(p)))) {}
+
+  std::string transform(std::string p) {

Review Comment:
   Call this e.g. `PatternToRegex`? Also, can take a `const std::string&` to avoid a pointless copy.



##########
cpp/src/arrow/filesystem/path_util.h:
##########
@@ -128,6 +128,17 @@ bool IsEmptyPath(util::string_view s);
 ARROW_EXPORT
 bool IsLikelyUri(util::string_view s);
 
+class ARROW_EXPORT Globber {

Review Comment:
   Can you add tests for this class somewhere in `filesystem_test.cc`?



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,38 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(std::string p) : pattern_(std::regex(std::move(transform(p)))) {}
+
+  std::string transform(std::string p) {

Review Comment:
   Yes, we can simply escape all of that. A switch/case statement would work.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org