You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/04/12 12:11:04 UTC

[arrow] branch master updated: ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f177404  ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)
f177404 is described below

commit f177404a25e4e79ad52ed4f9792f42595a65109e
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Apr 12 14:09:11 2018 +0200

    ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)
    
    - Fix PythonFile.seek() for offsets > 4 GB
    - Avoid instantiating a PythonFile in ParquetFile, for efficiency
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #1866 from pitrou/ARROW-2369 and squashes the following commits:
    
    20c5fa8 <Antoine Pitrou> ARROW-2369:  Fix reading large Parquet files (> 4 GB)
---
 cpp/src/arrow/python/io.cc      |  6 ++++--
 python/pyarrow/parquet.py       |  1 -
 python/pyarrow/tests/test_io.py | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc
index 36c193d..155e86f 100644
--- a/cpp/src/arrow/python/io.cc
+++ b/cpp/src/arrow/python/io.cc
@@ -65,14 +65,16 @@ class PythonFile {
 
   Status Seek(int64_t position, int whence) {
     // whence: 0 for relative to start of file, 2 for end of file
-    PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ii)", position, whence);
+    PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ni)",
+                                               static_cast<Py_ssize_t>(position), whence);
     Py_XDECREF(result);
     PY_RETURN_IF_ERROR(StatusCode::IOError);
     return Status::OK();
   }
 
   Status Read(int64_t nbytes, PyObject** out) {
-    PyObject* result = cpp_PyObject_CallMethod(file_, "read", "(i)", nbytes);
+    PyObject* result =
+        cpp_PyObject_CallMethod(file_, "read", "(n)", static_cast<Py_ssize_t>(nbytes));
     PY_RETURN_IF_ERROR(StatusCode::IOError);
     *out = result;
     return Status::OK();
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index beeedca..34aa55a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -60,7 +60,6 @@ class ParquetFile(object):
     """
     def __init__(self, source, metadata=None, common_metadata=None):
         self.reader = ParquetReader()
-        source = _ensure_file(source)
         self.reader.open(source, metadata=metadata)
         self.common_metadata = common_metadata
         self._nested_paths_by_prefix = self._build_nested_paths()
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index b29b9f1..1511600 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -21,6 +21,7 @@ import gc
 import os
 import pytest
 import sys
+import tempfile
 import weakref
 
 import numpy as np
@@ -30,6 +31,25 @@ import pandas as pd
 from pyarrow.compat import u, guid
 import pyarrow as pa
 
+
+def check_large_seeks(file_factory):
+    if sys.platform in ('win32', 'darwin'):
+        pytest.skip("need sparse file support")
+    try:
+        filename = tempfile.mktemp(prefix='test_io')
+        with open(filename, 'wb') as f:
+            f.truncate(2 ** 32 + 10)
+            f.seek(2 ** 32 + 5)
+            f.write(b'mark\n')
+        with file_factory(filename) as f:
+            assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
+            assert f.tell() == 2 ** 32 + 5
+            assert f.read(5) == b'mark\n'
+            assert f.tell() == 2 ** 32 + 10
+    finally:
+        os.unlink(filename)
+
+
 # ----------------------------------------------------------------------
 # Python file-like objects
 
@@ -83,6 +103,13 @@ def test_python_file_read():
     f.close()
 
 
+def test_python_file_large_seeks():
+    def factory(filename):
+        return pa.PythonFile(open(filename, 'rb'))
+
+    check_large_seeks(factory)
+
+
 def test_bytes_reader():
     # Like a BytesIO, but zero-copy underneath for C++ consumers
     data = b'some sample data'
@@ -544,6 +571,10 @@ def test_os_file_reader(sample_disk_data):
     _check_native_file_reader(pa.OSFile, sample_disk_data)
 
 
+def test_os_file_large_seeks():
+    check_large_seeks(pa.OSFile)
+
+
 def _try_delete(path):
     try:
         os.remove(path)
@@ -600,6 +631,10 @@ def test_memory_zero_length(tmpdir):
         assert memory_map.size() == 0
 
 
+def test_memory_map_large_seeks():
+    check_large_seeks(pa.memory_map)
+
+
 def test_os_file_writer(tmpdir):
     SIZE = 4096
     arr = np.random.randint(0, 256, size=SIZE).astype('u1')

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.