You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/03 13:30:38 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12625: ARROW-15587: [C++] Add support for all options specified by substrait::ReadRel::LocalFiles::FileOrFiles

pitrou commented on code in PR #12625:
URL: https://github.com/apache/arrow/pull/12625#discussion_r863755589


##########
cpp/src/arrow/filesystem/filesystem_test.cc:
##########
@@ -265,6 +265,24 @@ TEST(PathUtil, ToSlashes) {
 #endif
 }
 
+TEST(PathUtil, Globber) {
+  Globber localfs_linux("/f?o/bar/a?/1*.txt");
+  ASSERT_TRUE(localfs_linux.Matches("/foo/bar/a1/1.txt"));
+  ASSERT_TRUE(localfs_linux.Matches("/f#o/bar/ab/1000.txt"));

Review Comment:
   Can you test failing matches as well?
   For example, what happens when matching `"/foo/bar/ab/1/23.txt"`?



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,42 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(const std::string& p) : pattern_(std::regex(PatternToRegex(p))) {}
+
+  std::string PatternToRegex(const std::string& p) {
+    std::string special_chars_ = "()[]{}+-|^$\\.&~# \t\n\r\v\f";
+    std::string transformed;
+    for (char c : p) {
+      if (c == '*') {
+        if (transformed.back() != '\\')

Review Comment:
   What if `transformed` is empty here? It would seem better to structure the code a bit differently, e.g.:
   ```c++
   auto it = p.begin();
   while (it != p.end()) {
     if (*it == '\\') {
       // Escaping sequence
       if (++it == p.end()) {
         transformed += '\\';
       } else {
         transformed += *it++;
       }
       continue;
     }
     // other cases here
   }



##########
cpp/src/arrow/filesystem/path_util.h:
##########
@@ -128,6 +128,17 @@ bool IsEmptyPath(util::string_view s);
 ARROW_EXPORT
 bool IsLikelyUri(util::string_view s);
 
+class ARROW_EXPORT Globber {
+ public:
+  ~Globber();
+  explicit Globber(std::string pattern);
+  bool Matches(std::string path);

Review Comment:
   There is no need to make a copy here as the parameter isn't kept anywhere after this call.
   ```suggestion
     bool Matches(const std::string& path);
   ```



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,42 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(const std::string& p) : pattern_(std::regex(PatternToRegex(p))) {}
+
+  std::string PatternToRegex(const std::string& p) {
+    std::string special_chars_ = "()[]{}+-|^$\\.&~# \t\n\r\v\f";

Review Comment:
   Nit: local variable naming
   ```suggestion
       std::string special_chars = "()[]{}+-|^$\\.&~# \t\n\r\v\f";
   ```



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,52 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        const std::string& glob) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+
+  auto split_path = fs::internal::SplitAbstractPath(glob, '/');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+  results.push_back(std::move(file));
+
+  for (size_t i = 0; i < split_path.size(); i++) {
+    if (split_path[i].find_first_of("*?") == std::string::npos) {
+      if (cur.empty())
+        cur = split_path[i];
+      else
+        cur = fs::internal::ConcatAbstractPath(cur, split_path[i]);
+      continue;
+    } else {
+      for (auto res : results) {
+        if (res.type() != fs::FileType::Directory) continue;
+        selector.base_dir = res.path() + cur;
+        ARROW_ASSIGN_OR_RAISE(auto entries, filesystem->GetFileInfo(selector));

Review Comment:
   Note that instead of issuing a blocking call for each selector, it would probably be more efficient to use the async filesystem APIs. This can be left for another JIRA, though.



##########
cpp/src/arrow/filesystem/filesystem_test.cc:
##########
@@ -265,6 +265,24 @@ TEST(PathUtil, ToSlashes) {
 #endif
 }
 
+TEST(PathUtil, Globber) {
+  Globber localfs_linux("/f?o/bar/a?/1*.txt");
+  ASSERT_TRUE(localfs_linux.Matches("/foo/bar/a1/1.txt"));
+  ASSERT_TRUE(localfs_linux.Matches("/f#o/bar/ab/1000.txt"));
+
+  Globber localfs_windows("C:/f?o/bar/a?/1*.txt");
+  ASSERT_TRUE(localfs_windows.Matches("C:/f_o/bar/ac/1000.txt"));
+
+  Globber remotefs("remote://my|bucket(#?)/foo{*}/[?]bar~/b&z/a: *-c.txt");
+  ASSERT_TRUE(remotefs.Matches("remote://my|bucket(#0)/foo{}/[?]bar~/b&z/a: -c.txt"));
+  ASSERT_TRUE(
+      remotefs.Matches("remote://my|bucket(#%)/foo{abc}/[_]bar~/b&z/a: ab-c.txt"));
+
+  Globber wildcards("remote://bucket?/f\\?o/\\*/*.parquet");
+  ASSERT_TRUE(wildcards.Matches("remote://bucket0/f?o/*/abc.parquet"));
+  ASSERT_FALSE(wildcards.Matches("remote://bucket0/foo/ab/a.parquet"));
+}

Review Comment:
   Would be nice to add tests for edge cases, such as an empty pattern or `"\\"` or `"*"`...



##########
cpp/src/arrow/engine/substrait/serde_test.cc:
##########
@@ -654,7 +655,8 @@ TEST(Substrait, ReadRel) {
   ASSERT_EQ(scan_node_options.dataset->type_name(), "filesystem");
   const auto& dataset =
       checked_cast<const dataset::FileSystemDataset&>(*scan_node_options.dataset);
-  EXPECT_THAT(dataset.files(), ElementsAre("/tmp/dat1.parquet", "/tmp/dat2.parquet"));
+  EXPECT_THAT(dataset.files(),
+              UnorderedElementsAre("/tmp/dat1.parquet", "/tmp/dat2.parquet"));
   EXPECT_EQ(dataset.format()->type_name(), "parquet");
   EXPECT_EQ(*dataset.schema(), Schema({field("i", int64()), field("b", boolean())}));
 }

Review Comment:
   I'm surprised: we aren't adding any tests for globbing here?



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,52 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        const std::string& glob) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+
+  auto split_path = fs::internal::SplitAbstractPath(glob, '/');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+  results.push_back(std::move(file));
+
+  for (size_t i = 0; i < split_path.size(); i++) {
+    if (split_path[i].find_first_of("*?") == std::string::npos) {
+      if (cur.empty())
+        cur = split_path[i];
+      else
+        cur = fs::internal::ConcatAbstractPath(cur, split_path[i]);
+      continue;
+    } else {
+      for (auto res : results) {
+        if (res.type() != fs::FileType::Directory) continue;
+        selector.base_dir = res.path() + cur;
+        ARROW_ASSIGN_OR_RAISE(auto entries, filesystem->GetFileInfo(selector));
+        fs::internal::Globber globber(
+            fs::internal::ConcatAbstractPath(selector.base_dir, split_path[i]));
+        for (auto entry : entries) {
+          if (globber.Matches(entry.path())) {
+            temp.push_back(std::move(entry));
+          }
+        }
+      }
+      results = std::move(temp);
+      cur.clear();
+    }
+  }
+
+  if (!cur.empty()) {
+    for (size_t i = 0; i < results.size(); i++) {
+      ARROW_ASSIGN_OR_RAISE(
+          results[i], filesystem->GetFileInfo(
+                          fs::internal::ConcatAbstractPath(results[i].path(), cur)));

Review Comment:
   There's a version of `GetFileInfo` that takes a vector of paths, can you use it?



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -49,6 +50,52 @@ Status CheckRelCommon(const RelMessage& rel) {
   return Status::OK();
 }
 
+Result<fs::FileInfoVector> GetGlobFiles(const std::shared_ptr<fs::FileSystem>& filesystem,
+                                        const std::string& glob) {
+  fs::FileInfoVector results, temp;
+  fs::FileSelector selector;
+  std::string cur;
+
+  auto split_path = fs::internal::SplitAbstractPath(glob, '/');
+  ARROW_ASSIGN_OR_RAISE(auto file, filesystem->GetFileInfo("/"));
+  results.push_back(std::move(file));
+
+  for (size_t i = 0; i < split_path.size(); i++) {
+    if (split_path[i].find_first_of("*?") == std::string::npos) {
+      if (cur.empty())
+        cur = split_path[i];
+      else
+        cur = fs::internal::ConcatAbstractPath(cur, split_path[i]);
+      continue;
+    } else {
+      for (auto res : results) {
+        if (res.type() != fs::FileType::Directory) continue;
+        selector.base_dir = res.path() + cur;
+        ARROW_ASSIGN_OR_RAISE(auto entries, filesystem->GetFileInfo(selector));
+        fs::internal::Globber globber(
+            fs::internal::ConcatAbstractPath(selector.base_dir, split_path[i]));
+        for (auto entry : entries) {
+          if (globber.Matches(entry.path())) {
+            temp.push_back(std::move(entry));
+          }
+        }
+      }
+      results = std::move(temp);
+      cur.clear();
+    }
+  }
+
+  if (!cur.empty()) {
+    for (size_t i = 0; i < results.size(); i++) {
+      ARROW_ASSIGN_OR_RAISE(
+          results[i], filesystem->GetFileInfo(
+                          fs::internal::ConcatAbstractPath(results[i].path(), cur)));
+    }
+  }
+
+  return results;

Review Comment:
   Don't you want to filter the results for which the file type is `FileType::NotFound`?



##########
cpp/src/arrow/filesystem/path_util.cc:
##########
@@ -287,6 +288,42 @@ bool IsLikelyUri(util::string_view v) {
   return ::arrow::internal::IsValidUriScheme(v.substr(0, pos));
 }
 
+struct Globber::Impl {
+  std::regex pattern_;
+
+  explicit Impl(const std::string& p) : pattern_(std::regex(PatternToRegex(p))) {}
+
+  std::string PatternToRegex(const std::string& p) {

Review Comment:
   Nit: can make this a static method
   ```suggestion
     static std::string PatternToRegex(const std::string& p) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org