You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/14 12:37:33 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #11943: ARROW-14918: [C++] GcsFileSystem and listing files

pitrou commented on a change in pull request #11943:
URL: https://github.com/apache/arrow/pull/11943#discussion_r768619895



##########
File path: cpp/src/arrow/filesystem/gcsfs.cc
##########
@@ -285,6 +285,41 @@ class GcsFileSystem::Impl {
         path.object.back() == '/' ? FileType::Directory : FileType::File);
   }
 
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) {
+    ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir));
+    auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object);
+    auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/");
+    bool found_directory = false;
+    FileInfoVector result;
+    for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) {
+      if (!o.ok()) {
+        if (select.allow_not_found &&
+            o.status().code() == google::cloud::StatusCode::kNotFound) {
+          continue;
+        }
+        return internal::ToArrowStatus(o.status());
+      }
+      found_directory = true;
+      // Skip the directory itself from the results
+      if (o->name() == p.object) {
+        continue;
+      }
+      auto path = internal::ConcatAbstractPath(o->bucket(), o->name());
+      if (o->name().back() == '/') {
+        result.push_back(
+            FileInfo(internal::EnsureTrailingSlash(path), FileType::Directory));
+        continue;
+      }
+      auto info = FileInfo(path, FileType::File);
+      info.set_size(static_cast<int64_t>(o->size()));
+      result.push_back(std::move(info));
+    }
+    if (!found_directory && !select.allow_not_found) {
+      return Status::IOError("no such file or directory " + select.base_dir);

Review comment:
       ```suggestion
         return Status::IOError("No such file or directory: '", select.base_dir, "'");
   ```

##########
File path: cpp/src/arrow/filesystem/gcsfs_test.cc
##########
@@ -39,6 +39,11 @@
 
 namespace arrow {
 namespace fs {
+/// Custom comparison for FileInfo, we need this to use complex googletest matchers.

Review comment:
       I'm not sure that would solve your issue, but you can use `SortInfos` to get a deterministic comparand (see `arrow/filesystem/test_util.h`).

##########
File path: cpp/src/arrow/filesystem/gcsfs.cc
##########
@@ -285,6 +285,41 @@ class GcsFileSystem::Impl {
         path.object.back() == '/' ? FileType::Directory : FileType::File);
   }
 
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) {
+    ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir));
+    auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(p.object);
+    auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/");
+    bool found_directory = false;
+    FileInfoVector result;
+    for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) {
+      if (!o.ok()) {
+        if (select.allow_not_found &&
+            o.status().code() == google::cloud::StatusCode::kNotFound) {
+          continue;
+        }
+        return internal::ToArrowStatus(o.status());
+      }
+      found_directory = true;
+      // Skip the directory itself from the results
+      if (o->name() == p.object) {
+        continue;
+      }
+      auto path = internal::ConcatAbstractPath(o->bucket(), o->name());
+      if (o->name().back() == '/') {
+        result.push_back(
+            FileInfo(internal::EnsureTrailingSlash(path), FileType::Directory));
+        continue;
+      }
+      auto info = FileInfo(path, FileType::File);
+      info.set_size(static_cast<int64_t>(o->size()));

Review comment:
       For the record, the `ListObjects` results don't allow fetching the modification time?

##########
File path: cpp/src/arrow/filesystem/gcsfs_test.cc
##########
@@ -187,6 +193,31 @@ class GcsIntegrationTest : public ::testing::Test {
 
   std::string RandomFolderName() { return RandomChars(32) + "/"; }
 
+  struct Hierarchy {
+    std::string base_dir;
+    std::vector<FileInfo> contents;
+  };
+
+  Result<Hierarchy> CreateHierarchy(std::shared_ptr<arrow::fs::FileSystem> fs) {
+    const char* const kTestFolders[] = {
+        "a/", "a/0/", "a/0/0/", "a/1/", "a/2/",
+    };
+    auto result = Hierarchy{PreexistingBucketPath() + "a/", {}};
+    for (auto const* f : kTestFolders) {
+      const auto folder = PreexistingBucketPath() + f;
+      RETURN_NOT_OK(fs->CreateDir(folder, true));
+      result.contents.push_back(arrow::fs::Dir(folder));
+      for (int i = 0; i != 64; ++i) {
+        const auto filename = folder + "test-file-" + std::to_string(i);
+        ARROW_ASSIGN_OR_RAISE(auto w, fs->OpenOutputStream(filename, {}));
+        RETURN_NOT_OK(w->Write(filename.data(), filename.size()));
+        RETURN_NOT_OK(w->Close());

Review comment:
       As a nit, you can also use the helper function `CreateFile` (in `test_util.h`).




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org