You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by am...@apache.org on 2022/06/20 15:39:50 UTC

[arrow] branch master updated: ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)

This is an automated email from the ASF dual-hosted git repository.

amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d49d8de3a9 ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)
d49d8de3a9 is described below

commit d49d8de3a9158f66ff7073fe077c5c292e38de6e
Author: AlvinJ15 <al...@gmail.com>
AuthorDate: Mon Jun 20 10:39:42 2022 -0500

    ARROW-16382: [Python] Disable memory mapping by default in pyarrow (#13342)
    
    [Python] Disable memory mapping by default in pyarrow
    
    Authored-by: Alvin Chunga <al...@gmail.com>
    Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
 python/pyarrow/_parquet.pyx      |  2 +-
 python/pyarrow/feather.py        | 14 ++++++++------
 python/pyarrow/ipc.pxi           |  7 ++++---
 python/pyarrow/ipc.py            |  4 +++-
 python/pyarrow/serialization.pxi |  2 +-
 5 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 56ab0115a0..e0d4ba76ad 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1167,7 +1167,7 @@ cdef class ParquetReader(_Weakrefable):
         self.pool = maybe_unbox_memory_pool(memory_pool)
         self._metadata = None
 
-    def open(self, object source not None, *, bint use_memory_map=True,
+    def open(self, object source not None, *, bint use_memory_map=False,
              read_dictionary=None, FileMetaData metadata=None,
              int buffer_size=0, bint pre_buffer=False,
              coerce_int96_timestamp_unit=None,
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index f20302d67b..8fe0126ee2 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -204,7 +204,7 @@ def write_feather(df, dest, compression=None, compression_level=None,
         raise
 
 
-def read_feather(source, columns=None, use_threads=True, memory_map=True):
+def read_feather(source, columns=None, use_threads=True, memory_map=False):
     """
     Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
     feather.read_table.
@@ -212,6 +212,7 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
     Parameters
     ----------
     source : str file path, or file-like object
+        You can use a MemoryMappedFile as source to explicitly use memory mapping.
     columns : sequence, optional
         Only read a specific set of columns. If not provided, all columns are
         read.
@@ -219,8 +220,8 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
         Whether to parallelize reading using multiple threads. If false the
         restriction is used in the conversion to Pandas as well as in the
         reading from Feather format.
-    memory_map : boolean, default True
-        Use memory mapping when opening file on disk
+    memory_map : boolean, default False
+        Use memory mapping when opening file on disk, when source is a str.
 
     Returns
     -------
@@ -232,18 +233,19 @@ def read_feather(source, columns=None, use_threads=True, memory_map=True):
         use_threads=use_threads).to_pandas(use_threads=use_threads))
 
 
-def read_table(source, columns=None, memory_map=True, use_threads=True):
+def read_table(source, columns=None, memory_map=False, use_threads=True):
     """
     Read a pyarrow.Table from Feather format
 
     Parameters
     ----------
     source : str file path, or file-like object
+        You can use a MemoryMappedFile as source to explicitly use memory mapping.
     columns : sequence, optional
         Only read a specific set of columns. If not provided, all columns are
         read.
-    memory_map : boolean, default True
-        Use memory mapping when opening file on disk
+    memory_map : boolean, default False
+        Use memory mapping when opening file on disk, when source is a str.
     use_threads : bool, default True
         Whether to parallelize reading using multiple threads.
 
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 8b3784fae5..f0297ff004 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -401,7 +401,8 @@ cdef class MessageReader(_Weakrefable):
     @staticmethod
     def open_stream(source):
         """
-        Open stream from source.
+        Open stream from source. If you want to use memory mapping,
+        use a MemoryMappedFile as source.
 
         Parameters
         ----------
@@ -847,7 +848,7 @@ cdef class _RecordBatchFileReader(_Weakrefable):
         except TypeError:
             pass
 
-        get_reader(source, True, &self.file)
+        get_reader(source, False, &self.file)
 
         cdef int64_t offset = 0
         if footer_offset is not None:
@@ -1089,7 +1090,7 @@ def read_schema(obj, DictionaryMemo dictionary_memo=None):
     if isinstance(obj, Message):
         raise NotImplementedError(type(obj))
 
-    get_reader(obj, True, &cpp_file)
+    get_reader(obj, False, &cpp_file)
 
     if dictionary_memo is not None:
         arg_dict_memo = dictionary_memo.memo
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index 0e08bb3e58..d63c323b33 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -39,6 +39,7 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader):
     ----------
     source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
         Either an in-memory buffer, or a readable file object.
+        If you want to use memory mapping, use a MemoryMappedFile as source.
     options : pyarrow.ipc.IpcReadOptions
         Options for IPC deserialization.
         If None, default values will be used.
@@ -91,7 +92,8 @@ class RecordBatchFileReader(lib._RecordBatchFileReader):
     Parameters
     ----------
     source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
-        Either an in-memory buffer, or a readable file object
+        Either an in-memory buffer, or a readable file object.
+        If you want to use memory mapping, use a MemoryMappedFile as source.
     footer_offset : int, default None
         If the file is embedded in some larger file, this is the byte offset to
         the very end of the file data
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index c03721578a..41609693d8 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -453,7 +453,7 @@ def read_serialized(source, base=None):
 
 def _read_serialized(source, base=None):
     cdef shared_ptr[CRandomAccessFile] stream
-    get_reader(source, True, &stream)
+    get_reader(source, False, &stream)
 
     cdef SerializedPyObject serialized = SerializedPyObject()
     serialized.base = base