You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/04/12 12:11:04 UTC
[arrow] branch master updated: ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f177404 ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)
f177404 is described below
commit f177404a25e4e79ad52ed4f9792f42595a65109e
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Apr 12 14:09:11 2018 +0200
ARROW-2369: [Python] Fix reading large Parquet files (> 4 GB)
- Fix PythonFile.seek() for offsets > 4 GB
- Avoid instantiating a PythonFile in ParquetFile, for efficiency
Author: Antoine Pitrou <an...@python.org>
Closes #1866 from pitrou/ARROW-2369 and squashes the following commits:
20c5fa8 <Antoine Pitrou> ARROW-2369: Fix reading large Parquet files (> 4 GB)
---
cpp/src/arrow/python/io.cc | 6 ++++--
python/pyarrow/parquet.py | 1 -
python/pyarrow/tests/test_io.py | 35 +++++++++++++++++++++++++++++++++++
3 files changed, 39 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc
index 36c193d..155e86f 100644
--- a/cpp/src/arrow/python/io.cc
+++ b/cpp/src/arrow/python/io.cc
@@ -65,14 +65,16 @@ class PythonFile {
Status Seek(int64_t position, int whence) {
// whence: 0 for relative to start of file, 2 for end of file
- PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ii)", position, whence);
+ PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ni)",
+ static_cast<Py_ssize_t>(position), whence);
Py_XDECREF(result);
PY_RETURN_IF_ERROR(StatusCode::IOError);
return Status::OK();
}
Status Read(int64_t nbytes, PyObject** out) {
- PyObject* result = cpp_PyObject_CallMethod(file_, "read", "(i)", nbytes);
+ PyObject* result =
+ cpp_PyObject_CallMethod(file_, "read", "(n)", static_cast<Py_ssize_t>(nbytes));
PY_RETURN_IF_ERROR(StatusCode::IOError);
*out = result;
return Status::OK();
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index beeedca..34aa55a 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -60,7 +60,6 @@ class ParquetFile(object):
"""
def __init__(self, source, metadata=None, common_metadata=None):
self.reader = ParquetReader()
- source = _ensure_file(source)
self.reader.open(source, metadata=metadata)
self.common_metadata = common_metadata
self._nested_paths_by_prefix = self._build_nested_paths()
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index b29b9f1..1511600 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -21,6 +21,7 @@ import gc
import os
import pytest
import sys
+import tempfile
import weakref
import numpy as np
@@ -30,6 +31,25 @@ import pandas as pd
from pyarrow.compat import u, guid
import pyarrow as pa
+
+def check_large_seeks(file_factory):
+ if sys.platform in ('win32', 'darwin'):
+ pytest.skip("need sparse file support")
+ try:
+ filename = tempfile.mktemp(prefix='test_io')
+ with open(filename, 'wb') as f:
+ f.truncate(2 ** 32 + 10)
+ f.seek(2 ** 32 + 5)
+ f.write(b'mark\n')
+ with file_factory(filename) as f:
+ assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
+ assert f.tell() == 2 ** 32 + 5
+ assert f.read(5) == b'mark\n'
+ assert f.tell() == 2 ** 32 + 10
+ finally:
+ os.unlink(filename)
+
+
# ----------------------------------------------------------------------
# Python file-like objects
@@ -83,6 +103,13 @@ def test_python_file_read():
f.close()
+def test_python_file_large_seeks():
+ def factory(filename):
+ return pa.PythonFile(open(filename, 'rb'))
+
+ check_large_seeks(factory)
+
+
def test_bytes_reader():
# Like a BytesIO, but zero-copy underneath for C++ consumers
data = b'some sample data'
@@ -544,6 +571,10 @@ def test_os_file_reader(sample_disk_data):
_check_native_file_reader(pa.OSFile, sample_disk_data)
+def test_os_file_large_seeks():
+ check_large_seeks(pa.OSFile)
+
+
def _try_delete(path):
try:
os.remove(path)
@@ -600,6 +631,10 @@ def test_memory_zero_length(tmpdir):
assert memory_map.size() == 0
+def test_memory_map_large_seeks():
+ check_large_seeks(pa.memory_map)
+
+
def test_os_file_writer(tmpdir):
SIZE = 4096
arr = np.random.randint(0, 256, size=SIZE).astype('u1')
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.