Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/12 13:25:01 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #12800: ARROW-15776: [Python] Expose IpcReadOptions

pitrou commented on code in PR #12800:
URL: https://github.com/apache/arrow/pull/12800#discussion_r848428439


##########
python/pyarrow/ipc.py:
##########
@@ -39,10 +39,17 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader):
     ----------
     source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
         Either an in-memory buffer, or a readable file object.
+    options : pyarrow.ipc.IpcReadOptions
+        Options for IPC serialization.
+
+        If None, default values will be used.
+    memory_pool : MemoryPool, default None
+        Uses default memory pool if not specified.
     """
 
-    def __init__(self, source):
-        self._open(source)
+    def __init__(self, source, *, options=None, memory_pool=None):

Review Comment:
   Why not put the `memory_pool` parameter in `IpcReadOptions` as in C++?
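
A rough sketch of what that alternative could look like: in C++, `arrow::ipc::IpcReadOptions` carries the memory pool itself, so the Python wrapper could mirror it by folding `memory_pool` into the options object instead of taking it on the reader constructor. The class below is an illustrative plain-Python stand-in, not the Cython class under review.

```python
import pyarrow as pa


class IpcReadOptionsSketch:
    """Illustrative stand-in for IpcReadOptions with the pool folded in."""

    def __init__(self, *, ensure_native_endian=True, use_threads=True,
                 included_fields=None, memory_pool=None):
        self.ensure_native_endian = ensure_native_endian
        self.use_threads = use_threads
        self.included_fields = included_fields or []
        # Fall back to the default pool when none is given, as the C++
        # struct does with arrow::default_memory_pool().
        self.memory_pool = memory_pool or pa.default_memory_pool()


# The reader would then only need (source, options):
#   RecordBatchStreamReader(source, options=IpcReadOptionsSketch(memory_pool=pool))
```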



##########
python/pyarrow/ipc.pxi:
##########
@@ -100,6 +100,63 @@ cdef _wrap_read_stats(CIpcReadStats c):
                      c.num_replaced_dictionaries)
 
 
+cdef class IpcReadOptions(_Weakrefable):
+    """
+    Serialization options for reading IPC format.
+
+    Parameters
+    ----------
+    use_threads : bool
+        Whether to use the global CPU thread pool to parallelize any
+        computational tasks like decompression.
+    ensure_native_endian : bool
+        Whether to convert incoming data to platform-native endianness.
+        Default is true.
+    included_fields : list
+        If empty (the default), return all deserialized fields.
+        If non-empty, the values are the indices of fields to read on
+        the top-level schema.
+    """
+    __slots__ = ()
+
+    # cdef block is in lib.pxd
+
+    def __init__(self, *, bint ensure_native_endian=True,
+                 bint use_threads=True, list included_fields=None):
+        self.c_options = CIpcReadOptions.Defaults()
+        self.ensure_native_endian = ensure_native_endian
+        self.use_threads = use_threads
+        if included_fields is not None:
+            self.included_fields = included_fields
+
+    @property
+    def ensure_native_endian(self):
+        return self.c_options.ensure_native_endian
+
+    @ensure_native_endian.setter
+    def ensure_native_endian(self, bint value):
+        self.c_options.ensure_native_endian = value
+
+    @property
+    def use_threads(self):
+        return self.c_options.use_threads
+
+    @use_threads.setter
+    def use_threads(self, bint value):
+        self.c_options.use_threads = value
+
+    @property
+    def included_fields(self):
+        return self.c_options.included_fields
+
+    @included_fields.setter
+    def included_fields(self, list value):
+        if value is None:

Review Comment:
   I'm not sure allowing None is useful. You can instead write the method argument as `list value not None`.
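
For reference, the pattern the comment suggests looks like this in Cython; the assignment line is assumed to follow the same pattern as the other setters in this diff (the original setter body is truncated above):

```cython
    @included_fields.setter
    def included_fields(self, list value not None):
        # `not None` makes Cython raise TypeError when None is passed,
        # so no explicit None check is needed inside the setter.
        self.c_options.included_fields = value
```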



##########
python/pyarrow/tests/test_ipc.py:
##########
@@ -487,6 +538,51 @@ def test_stream_options_roundtrip(stream_fixture, options):
         reader.read_next_batch()
 
 
+def test_read_options():
+    options = pa.ipc.IpcReadOptions()
+    assert options.use_threads is True
+    assert options.ensure_native_endian is True
+    assert options.included_fields == []
+
+    options.ensure_native_endian = False
+    assert options.ensure_native_endian is False
+
+    options.use_threads = False
+    assert options.use_threads is False
+
+    options.included_fields = [0, 1]
+    assert options.included_fields == [0, 1]
+    options.included_fields = None
+    assert options.included_fields == []
+
+    options = pa.ipc.IpcReadOptions(
+        use_threads=False, ensure_native_endian=False,
+        included_fields=[1]
+    )
+    assert options.use_threads is False
+    assert options.ensure_native_endian is False
+    assert options.included_fields == [1]
+
+
+def test_read_options_included_fields(stream_fixture):
+    options1 = pa.ipc.IpcReadOptions()
+    options2 = pa.ipc.IpcReadOptions(included_fields=[1])
+    stream_fixture.write_batches()
+    source = stream_fixture.get_source()
+
+    reader1 = pa.ipc.open_stream(source, options=options1)
+    reader2 = pa.ipc.open_stream(
+        source, options=options2, memory_pool=pa.system_memory_pool())
+
+    result1 = reader1.read_all()
+    result2 = reader2.read_all()
+
+    assert result1.num_columns == 2
+    assert result2.num_columns == 1
+
+    assert result2[0] == result1[1]

Review Comment:
   Can we compare with expected results instead?
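
One possible shape for that check, as a self-contained sketch: write known data and compare the projected read against an explicitly constructed table. The column names and values below are placeholders rather than the fixture's real contents, and the `options=` keyword is the one added by this PR.

```python
import pyarrow as pa

# Placeholder data standing in for the fixture's batches.
batch = pa.RecordBatch.from_pydict({"f0": [1, 2], "f1": ["a", "b"]})

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
source = sink.getvalue()

# Read back only field index 1, using the options added in this PR.
options = pa.ipc.IpcReadOptions(included_fields=[1])
result = pa.ipc.open_stream(source, options=options).read_all()

# Compare against an explicit expected table instead of another read.
assert result == pa.table({"f1": ["a", "b"]})
```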



##########
python/pyarrow/ipc.py:
##########
@@ -39,10 +39,17 @@ class RecordBatchStreamReader(lib._RecordBatchStreamReader):
     ----------
     source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object
         Either an in-memory buffer, or a readable file object.
+    options : pyarrow.ipc.IpcReadOptions
+        Options for IPC serialization.
+
+        If None, default values will be used.

Review Comment:
   No need for the empty line IMHO.
   ```suggestion
           Options for IPC deserialization.
           If None, default values will be used.
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org