You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/06/05 13:35:37 UTC

[GitHub] [arrow] fsaintjacques commented on a change in pull request #7156: ARROW-8074: [C++][Dataset][Python] FileFragments from buffers and NativeFiles

fsaintjacques commented on a change in pull request #7156:
URL: https://github.com/apache/arrow/pull/7156#discussion_r426085245



##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
     raise TypeError(msg)
 
 
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+    cdef:
+        # XXX why is shared_ptr necessary here? CFileSource shouldn't need it

Review comment:
       This comment is dead.

##########
File path: cpp/src/arrow/dataset/test_util.h
##########
@@ -251,6 +251,15 @@ class JSONRecordBatchFileFormat : public FileFormat {
   SchemaResolver resolver_;
 };
 
+inline static std::vector<FileSource> SourcesFromPaths(

Review comment:
       Seems like a useful static method `FileSource::FromPaths`.

##########
File path: cpp/src/arrow/result.h
##########
@@ -298,7 +298,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
   ///
   /// \return The stored non-OK status object, or an OK status if this object
   ///         has a value.
-  Status status() const { return status_; }
+  const Status& status() const { return status_; }

Review comment:
       Good catch.

##########
File path: cpp/src/arrow/flight/client.cc
##########
@@ -108,7 +108,7 @@ class FinishableStream {
   std::shared_ptr<Stream> stream() const { return stream_; }
 
   /// \brief Finish the call, adding server context to the given status.
-  virtual Status Finish(Status&& st) {
+  virtual Status Finish(Status st) {

Review comment:
       This PR shouldn't touch this code unless it's failing at building?

##########
File path: cpp/src/arrow/util/checked_cast.h
##########
@@ -39,11 +40,11 @@ inline OutputType checked_cast(InputType&& value) {
 }
 
 template <class T, class U>
-std::shared_ptr<T> checked_pointer_cast(const std::shared_ptr<U>& r) noexcept {
+std::shared_ptr<T> checked_pointer_cast(std::shared_ptr<U> r) noexcept {

Review comment:
       Shouldn't we stick with the signature from the standard?
   
   https://en.cppreference.com/w/cpp/memory/shared_ptr/pointer_cast

##########
File path: cpp/src/arrow/status.h
##########
@@ -306,8 +306,12 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
   std::string message() const { return ok() ? "" : state_->msg; }
 
   /// \brief Return the status detail attached to this message.
-  std::shared_ptr<StatusDetail> detail() const {
-    return state_ == NULLPTR ? NULLPTR : state_->detail;
+  const std::shared_ptr<StatusDetail>& detail() const {
+    if (state_ == NULLPTR) {

Review comment:
       ```suggestion
       static std::shared_ptr<StatusDetail> no_detail = NULLPTR;
       return state_ ? state_->detail : no_detail;
   ```

##########
File path: cpp/src/arrow/dataset/file_base.h
##########
@@ -45,82 +46,150 @@ namespace dataset {
 /// be read like a file
 class ARROW_DS_EXPORT FileSource {
  public:
-  // NOTE(kszucs): it'd be better to separate the BufferSource from FileSource
-  enum SourceKind { PATH, BUFFER };
-
   FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
-             Compression::type compression = Compression::UNCOMPRESSED,
-             bool writable = true)
+             Compression::type compression = Compression::UNCOMPRESSED)
       : impl_(PathAndFileSystem{std::move(path), std::move(filesystem)}),
-        compression_(compression),
-        writable_(writable) {}
+        compression_(compression) {}
 
   explicit FileSource(std::shared_ptr<Buffer> buffer,
                       Compression::type compression = Compression::UNCOMPRESSED)
       : impl_(std::move(buffer)), compression_(compression) {}
 
-  explicit FileSource(std::shared_ptr<ResizableBuffer> buffer,
-                      Compression::type compression = Compression::UNCOMPRESSED)
-      : impl_(std::move(buffer)), compression_(compression), writable_(true) {}
-
-  bool operator==(const FileSource& other) const {
-    if (id() != other.id()) {
-      return false;
-    }
+  using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
+  explicit FileSource(CustomOpen open) : impl_(std::move(open)) {}
 
-    if (id() == PATH) {
-      return path() == other.path() && filesystem() == other.filesystem();
-    }
+  using CustomOpenWithCompression =
+      std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
+  explicit FileSource(CustomOpenWithCompression open_with_compression,
+                      Compression::type compression = Compression::UNCOMPRESSED)
+      : impl_(std::bind(std::move(open_with_compression), compression)),
+        compression_(compression) {}
 
-    return buffer()->Equals(*other.buffer());
-  }
+  explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
+                      Compression::type compression = Compression::UNCOMPRESSED)
+      : impl_([=] { return ToResult(file); }), compression_(compression) {}
 
-  /// \brief The kind of file, whether stored in a filesystem, memory
-  /// resident, or other
-  SourceKind id() const { return static_cast<SourceKind>(impl_.index()); }
+  FileSource() : impl_(CustomOpen{&InvalidOpen}) {}
 
   /// \brief Return the type of raw compression on the file, if any
   Compression::type compression() const { return compression_; }
 
-  /// \brief Whether the this source may be opened writable
-  bool writable() const { return writable_; }
-
-  /// \brief Return the file path, if any. Only valid when file source
-  /// type is PATH
+  /// \brief Return the file path, if any. Only valid when file source wraps a path.
   const std::string& path() const {
-    static std::string buffer_path = "<Buffer>";
-    return id() == PATH ? util::get<PATH>(impl_).path : buffer_path;
+    if (IsPath()) {
+      return util::get<PathAndFileSystem>(impl_).path;
+    }
+    if (IsBuffer()) {
+      static std::string no_path = "<Buffer>";
+      return no_path;
+    } else {
+      static std::string no_path = "<CustomOpen>";
+      return no_path;
+    }
   }
 
-  /// \brief Return the filesystem, if any. Only non null when file
-  /// source type is PATH
+  /// \brief Return the filesystem, if any. Otherwise returns nullptr
   const std::shared_ptr<fs::FileSystem>& filesystem() const {
-    static std::shared_ptr<fs::FileSystem> no_fs = NULLPTR;
-    return id() == PATH ? util::get<PATH>(impl_).filesystem : no_fs;
+    if (!IsPath()) {
+      static std::shared_ptr<fs::FileSystem> no_fs = NULLPTR;
+      return no_fs;
+    }
+    return util::get<PathAndFileSystem>(impl_).filesystem;
   }
 
   /// \brief Return the buffer containing the file, if any. Only value
   /// when file source type is BUFFER
   const std::shared_ptr<Buffer>& buffer() const {
-    static std::shared_ptr<Buffer> path_buffer = NULLPTR;
-    return id() == BUFFER ? util::get<BUFFER>(impl_) : path_buffer;
+    if (!IsBuffer()) {
+      static std::shared_ptr<Buffer> no_buffer = NULLPTR;
+      return no_buffer;
+    }
+    return util::get<std::shared_ptr<Buffer>>(impl_);
   }
 
   /// \brief Get a RandomAccessFile which views this file source
-  Result<std::shared_ptr<arrow::io::RandomAccessFile>> Open() const;
+  Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
+
+ private:
+  static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
+    return Status::Invalid("Called Open() on an uninitialized FileSource");
+  }
+
+  bool IsPath() const { return util::holds_alternative<PathAndFileSystem>(impl_); }
+
+  bool IsBuffer() const {
+    return util::holds_alternative<std::shared_ptr<Buffer>>(impl_);
+  }
+
+  struct PathAndFileSystem {
+    std::string path;
+    std::shared_ptr<fs::FileSystem> filesystem;
+  };
+
+  util::variant<PathAndFileSystem, std::shared_ptr<Buffer>, CustomOpen> impl_;
+  Compression::type compression_ = Compression::UNCOMPRESSED;
+};
+
+/// \brief The path and filesystem where an actual file is located or a buffer which can
+/// be written to like a file
+class ARROW_DS_EXPORT WritableFileSource {

Review comment:
       Instead of having to copy the class with the exact same code, can we stick with `OpenWritable()`?

##########
File path: cpp/src/arrow/dataset/file_base.h
##########
@@ -45,82 +46,150 @@ namespace dataset {
 /// be read like a file
 class ARROW_DS_EXPORT FileSource {
  public:
-  // NOTE(kszucs): it'd be better to separate the BufferSource from FileSource
-  enum SourceKind { PATH, BUFFER };
-
   FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
-             Compression::type compression = Compression::UNCOMPRESSED,
-             bool writable = true)
+             Compression::type compression = Compression::UNCOMPRESSED)

Review comment:
       I'm not satisfied how this class is transforming into a Franken-class. The need of static fake properties and the semi-broken default constructor.
   
   I'd say make FileSource an interface and use inheritance, make Open() virtual, the `path()` and `filesystem()`  will be specific to one implementation (maybe name them Source, FileSource, BufferSource, ...). We can make an accept visitor for classes who wants to touch properties like the path.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org