You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/04/23 20:56:58 UTC

[arrow] branch master updated: ARROW-2470: [C++] Avoid seeking in GetFileSize

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b65205e  ARROW-2470: [C++] Avoid seeking in GetFileSize
b65205e is described below

commit b65205e32eaa5d8548b6f1a7f9893d3b2ebf8588
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Apr 23 22:56:51 2018 +0200

    ARROW-2470: [C++] Avoid seeking in GetFileSize
    
    This makes GetFileSize thread-safe and also reduces its cost.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #1934 from pitrou/ARROW-2470-getfilesize and squashes the following commits:
    
    52392075 <Antoine Pitrou> ARROW-2470:  Avoid seeking in GetFileSize
---
 cpp/src/arrow/io/io-file-test.cc | 27 ++++++++++++++++++++++++--
 cpp/src/arrow/util/io-util.cc    | 41 +++++++++++++++++++++++-----------------
 python/pyarrow/tests/test_io.py  |  2 ++
 3 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc
index b661fb6..40e24aa 100644
--- a/cpp/src/arrow/io/io-file-test.cc
+++ b/cpp/src/arrow/io/io-file-test.cc
@@ -322,6 +322,9 @@ TEST_F(TestReadableFile, SeekTellSize) {
   ASSERT_OK(file_->GetSize(&size));
   ASSERT_EQ(8, size);
 
+  ASSERT_OK(file_->Tell(&position));
+  ASSERT_EQ(100, position);
+
   // does not support zero copy
   ASSERT_FALSE(file_->supports_zero_copy());
 }
@@ -526,6 +529,12 @@ TEST_F(TestPipeIO, TestWrite) {
   ASSERT_EQ(bytes_read, 0);
 }
 
+TEST_F(TestPipeIO, ReadableFileFails) {
+  // ReadableFile fails on non-seekable fd
+  std::shared_ptr<ReadableFile> file;
+  ASSERT_RAISES(IOError, ReadableFile::Open(r_, &file));
+}
+
 // ----------------------------------------------------------------------
 // Memory map tests
 
@@ -541,7 +550,7 @@ TEST_F(TestMemoryMappedFile, ZeroSizeFlie) {
   std::shared_ptr<MemoryMappedFile> result;
   ASSERT_OK(InitMemoryMap(0, path, &result));
 
-  int64_t size = 0;
+  int64_t size = -1;
   ASSERT_OK(result->Tell(&size));
   ASSERT_EQ(0, size);
 }
@@ -554,7 +563,7 @@ TEST_F(TestMemoryMappedFile, WriteRead) {
 
   const int reps = 5;
 
-  std::string path = "ipc-write-read-test";
+  std::string path = "io-memory-map-write-read-test";
   std::shared_ptr<MemoryMappedFile> result;
   ASSERT_OK(InitMemoryMap(reps * buffer_size, path, &result));
 
@@ -570,6 +579,20 @@ TEST_F(TestMemoryMappedFile, WriteRead) {
   }
 }
 
+TEST_F(TestMemoryMappedFile, GetSize) {
+  std::string path = "io-memory-map-get-size";
+  std::shared_ptr<MemoryMappedFile> result;
+  ASSERT_OK(InitMemoryMap(16384, path, &result));
+
+  int64_t size = -1;
+  ASSERT_OK(result->GetSize(&size));
+  ASSERT_EQ(16384, size);
+
+  int64_t position = -1;
+  ASSERT_OK(result->Tell(&position));
+  ASSERT_EQ(0, position);
+}
+
 TEST_F(TestMemoryMappedFile, ReadOnly) {
   const int64_t buffer_size = 1024;
   std::vector<uint8_t> buffer(buffer_size);
diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc
index 10a30df..03edc18 100644
--- a/cpp/src/arrow/util/io-util.cc
+++ b/cpp/src/arrow/util/io-util.cc
@@ -255,25 +255,32 @@ Status FileSeek(int fd, int64_t pos, int whence) {
 Status FileSeek(int fd, int64_t pos) { return FileSeek(fd, pos, SEEK_SET); }
 
 Status FileGetSize(int fd, int64_t* size) {
-  int64_t ret;
-
-  // XXX Should use fstat() instead, but this function also ensures the
-  // file is seekable
-
-  // Save current position
-  int64_t current_position = lseek64_compat(fd, 0, SEEK_CUR);
-  CHECK_LSEEK(current_position);
-
-  // Move to end of the file, which returns the file length
-  ret = lseek64_compat(fd, 0, SEEK_END);
-  CHECK_LSEEK(ret);
-
-  *size = ret;
+#if defined(_MSC_VER)
+  struct __stat64 st;
+#else
+  struct stat st;
+#endif
+  st.st_size = -1;
 
-  // Restore file position
-  ret = lseek64_compat(fd, current_position, SEEK_SET);
-  CHECK_LSEEK(ret);
+#if defined(_MSC_VER)
+  int ret = _fstat64(fd, &st);
+#else
+  int ret = fstat(fd, &st);
+#endif
 
+  if (ret == -1) {
+    return Status::IOError("error stat()ing file");
+  }
+  if (st.st_size == 0) {
+    // Maybe the file doesn't support getting its size, double-check by
+    // trying to tell() (seekable files usually have a size, while
+    // non-seekable files don't)
+    int64_t position;
+    RETURN_NOT_OK(FileTell(fd, &position));
+  } else if (st.st_size < 0) {
+    return Status::IOError("error getting file size");
+  }
+  *size = st.st_size;
   return Status::OK();
 }
 
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 1511600..02851be 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -100,6 +100,8 @@ def test_python_file_read():
     assert v == b'sample data'
     assert len(v) == 11
 
+    assert f.size() == len(data)
+
     f.close()
 
 

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.