You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/07/26 00:23:55 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #13662: ARROW-14635: [Python][C++] implement fadvise

westonpace commented on code in PR #13662:
URL: https://github.com/apache/arrow/pull/13662#discussion_r929419924


##########
cpp/src/arrow/io/file.h:
##########
@@ -54,11 +54,12 @@ class ARROW_EXPORT FileOutputStream : public OutputStream {
   /// \brief Open a file descriptor for writing.  The underlying file isn't
   /// truncated.
   /// \param[in] fd file descriptor
+  /// \param[in] reuse should we use the page cache for the write

Review Comment:
   ```suggestion
     /// \param[in] reuse if false, the written data will not be kept in the OS page cache.
     ///                  This will have some negative impact on write throughput but can be
     ///                  used during large writes to avoid evicting more important data from
     ///                  the page cache.
   ```



##########
cpp/src/arrow/filesystem/localfs.h:
##########
@@ -37,7 +37,7 @@ struct ARROW_EXPORT LocalFileSystemOptions {
   /// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
   /// or a regular one.
   bool use_mmap = false;
-
+  bool reuse = true;

Review Comment:
   Please add a doc comment explaining the meaning of this flag.



##########
cpp/src/arrow/io/file.cc:
##########
@@ -167,8 +169,23 @@ class OSFile {
     if (length < 0) {
       return Status::IOError("Length must be non-negative");
     }
+#ifdef __linux__
+    if (reuse_) {
+      return ::arrow::internal::FileWrite(fd_.fd(),
+                                          reinterpret_cast<const uint8_t*>(data), length);
+    } else {
+      auto status = ::arrow::internal::FileWrite(
+          fd_.fd(), reinterpret_cast<const uint8_t*>(data), length);
+      posix_fadvise(fd_.fd(), offset_, length, POSIX_FADV_DONTNEED);
+      offset_ += length;
+      return status;
+    }
+#else
+    // the posix_fadvise is not implemented on other filesystems.
+    // there could be alternative ways of getting this working but not implemented here.

Review Comment:
   ```suggestion
       // posix_fadvise is not implemented on other filesystems.
       // there could be alternative ways of getting this working but not implemented here.
   ```



##########
cpp/src/arrow/filesystem/localfs.cc:
##########
@@ -434,14 +434,15 @@ namespace {
 
 Result<std::shared_ptr<io::OutputStream>> OpenOutputStreamGeneric(const std::string& path,
                                                                   bool truncate,
-                                                                  bool append) {
+                                                                  bool append,
+                                                                  bool reuse) {
   RETURN_NOT_OK(ValidatePath(path));
   ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
   const bool write_only = true;
-  ARROW_ASSIGN_OR_RAISE(
-      auto fd, ::arrow::internal::FileOpenWritable(fn, write_only, truncate, append));
+  ARROW_ASSIGN_OR_RAISE(auto fd, ::arrow::internal::FileOpenWritable(
+                                     fn, write_only, truncate, append, !reuse));

Review Comment:
   ```suggestion
     // When reuse is false we need to open the file in sync mode because fadvise will
     // not drop the data from cache unless it has finished writing.
     ARROW_ASSIGN_OR_RAISE(auto fd, ::arrow::internal::FileOpenWritable(
                                        fn, write_only, truncate, append, !reuse));
   ```



##########
cpp/src/arrow/io/file.cc:
##########
@@ -207,6 +224,8 @@ class OSFile {
   FileDescriptor fd_;
   FileMode::type mode_;
   int64_t size_{-1};
+  bool reuse_;
+  int64_t offset_{0};

Review Comment:
   Does this need to be updated on a seek?



##########
python/pyarrow/_fs.pyx:
##########
@@ -782,6 +782,9 @@ cdef class LocalFileSystem(FileSystem):
     use_mmap : bool, default False
         Whether open_input_stream and open_input_file should return
         a mmap'ed file or a regular file.
+    reuse : bool, default True
+        If set to False, will use posix_fadvise to (try) not use the page cache.
+        This will only work on Linux!!

Review Comment:
   ```suggestion
           If set to False, will use posix_fadvise to (try) not use the page cache.
           This will only work on Linux.
   ```
   
   Why `(try)`?  Also, we should make this comment consistent with the one in the C++.  The user should be advised this will have a negative impact on performance.  The user should be informed that they might want to do this to prevent swapping out important information.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org