You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by am...@apache.org on 2022/06/20 15:39:50 UTC
[arrow] branch master updated: ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)
This is an automated email from the ASF dual-hosted git repository.
amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d49d8de3a9 ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)
d49d8de3a9 is described below
commit d49d8de3a9158f66ff7073fe077c5c292e38de6e
Author: AlvinJ15 <al...@gmail.com>
AuthorDate: Mon Jun 20 10:39:42 2022 -0500
ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)
[Python] Disable memory mapping by default in pyarrow
Authored-by: Alvin Chunga <al...@gmail.com>
Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
python/pyarrow/_parquet.pyx | 2 +-
python/pyarrow/feather.py | 14 ++++++++------
python/pyarrow/ipc.pxi | 7 ++++---
python/pyarrow/ipc.py | 4 +++-
python/pyarrow/serialization.pxi | 2 +-
5 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 56ab0115a0..e0d4ba76ad 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1167,7 +1167,7 @@ cdef class ParquetReader(_Weakrefable):
self.pool = maybe_unbox_memory_pool(memory_pool)
self._metadata = None
- def open(self, object source not None, *, bint use_memory_map=True,
+ def open(self, object source not None, *, bint use_memory_map=False,
read_dictionary=None, FileMetaData metadata=None,
int buffer_size=0, bint pre_buffer=False,
coerce_int96_timestamp_unit=None,
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index f20302d67b..8fe0126ee2 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -204,7 +204,7 @@ def write_feather(df, dest, compression=None, compression_level=None,
raise
-def read_feather(source, columns=None, use_threads=True, memory_map=True):
+def read_feather(source, columns=None, use_threads=True, memory_map=False):
"""
Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
feather.read_table.
@@ -212,6 +212,7 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
Parameters
----------
source : str file path, or file-like object
+ You can use MemoryMappedFile as source to explicitly use memory mapping.
columns : sequence, optional
Only read a specific set of columns. If not provided, all columns are
read.
@@ -219,8 +220,8 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
Whether to parallelize reading using multiple threads. If false the
restriction is used in the conversion to Pandas as well as in the
reading from Feather format.
- memory_map : boolean, default True
- Use memory mapping when opening file on disk
+ memory_map : boolean, default False
+ Use memory mapping when opening file on disk, when source is a str.
Returns
-------
@@ -232,18 +233,19 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
use_threads=use_threads).to_pandas(use_threads=use_threads))
-def read_table(source, columns=None, memory_map=True, use_threads=True):
+def read_table(source, columns=None, memory_map=False, use_threads=True):
"""
Read a pyarrow.Table from Feather format
Parameters
----------
source : str file path, or file-like object
+ You can use MemoryMappedFile as source to explicitly use memory mapping.
columns : sequence, optional
Only read a specific set of columns. If not provided, all columns are
read.
- memory_map : boolean, default True
- Use memory mapping when opening file on disk
+ memory_map : boolean, default False
+ Use memory mapping when opening file on disk, when source is a str.
use_threads : bool, default True
Whether to parallelize reading using multiple threads.
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 8b3784fae5..f0297ff004 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -401,7 +401,8 @@ cdef class MessageReader(_Weakrefable):
@staticmethod
def open_stream(source):
"""
- Open stream from source.
+ Open stream from source. If you want to use memory mapping, use
+ MemoryMappedFile as source.
Parameters
----------
@@ -847,7 +848,7 @@ cdef class _RecordBatchFileReader(_Weakrefable):
except TypeError:
pass
- get_reader(source, True, &self.file)
+ get_reader(source, False, &self.file)
cdef int64_t offset = 0
if footer_offset is not None:
@@ -1089,7 +1090,7 @@ def read_schema(obj, DictionaryMemo dictionary_memo=None):
if isinstance(obj, Message):
raise NotImplementedError(type(obj))
- get_reader(obj, True, &cpp_file)
+ get_reader(obj, False, &cpp_file)
if dictionary_memo is not None:
arg_dict_memo = dictionary_memo.memo
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index 0e08bb3e58..d63c323b33 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -39,6 +39,7 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader):
----------
source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
Either an in-memory buffer, or a readable file object.
+ If you want to use memory mapping, use MemoryMappedFile as source.
options : pyarrow.ipc.IpcReadOptions
Options for IPC deserialization.
If None, default values will be used.
@@ -91,7 +92,8 @@ class RecordBatchFileReader(lib._RecordBatchFileReader):
Parameters
----------
source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
- Either an in-memory buffer, or a readable file object
+ Either an in-memory buffer, or a readable file object.
+ If you want to use memory mapping, use MemoryMappedFile as source.
footer_offset : int, default None
If the file is embedded in some larger file, this is the byte offset to
the very end of the file data
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index c03721578a..41609693d8 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -453,7 +453,7 @@ def read_serialized(source, base=None):
def _read_serialized(source, base=None):
cdef shared_ptr[CRandomAccessFile] stream
- get_reader(source, True, &stream)
+ get_reader(source, False, &stream)
cdef SerializedPyObject serialized = SerializedPyObject()
serialized.base = base