You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/05/14 15:23:30 UTC

arrow git commit: ARROW-1022: [Python] Add multithreaded read option to read_feather

Repository: arrow
Updated Branches:
  refs/heads/master 5739e04b3 -> d8d3d8435


ARROW-1022: [Python] Add multithreaded read option to read_feather

Author: Wes McKinney <we...@twosigma.com>

Closes #682 from wesm/ARROW-1022 and squashes the following commits:

8fd241e [Wes McKinney] Add multithreaded read option to read_feather


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d8d3d843
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d8d3d843
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d8d3d843

Branch: refs/heads/master
Commit: d8d3d84354d827e45c8267cd05aecd2aa36cf60b
Parents: 5739e04
Author: Wes McKinney <we...@twosigma.com>
Authored: Sun May 14 17:23:26 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Sun May 14 17:23:26 2017 +0200

----------------------------------------------------------------------
 python/pyarrow/feather.py            | 10 ++++++----
 python/pyarrow/tests/test_feather.py | 11 +++++++++--
 2 files changed, 15 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 3754aec..34783a7 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -37,7 +37,7 @@ class FeatherReader(ext.FeatherReader):
         self.source = source
         self.open(source)
 
-    def read(self, columns=None):
+    def read(self, columns=None, nthreads=1):
         if columns is not None:
             column_set = set(columns)
         else:
@@ -53,7 +53,7 @@ class FeatherReader(ext.FeatherReader):
                 names.append(name)
 
         table = Table.from_arrays(columns, names=names)
-        return table.to_pandas()
+        return table.to_pandas(nthreads=nthreads)
 
 
 class FeatherWriter(object):
@@ -118,7 +118,7 @@ def write_feather(df, dest):
         raise
 
 
-def read_feather(source, columns=None):
+def read_feather(source, columns=None, nthreads=1):
     """
     Read a pandas.DataFrame from Feather format
 
@@ -128,10 +128,12 @@ def read_feather(source, columns=None):
     columns : sequence, optional
         Only read a specific set of columns. If not provided, all columns are
         read
+    nthreads : int, default 1
+        Number of CPU threads to use when reading to pandas.DataFrame
 
     Returns
     -------
     df : pandas.DataFrame
     """
     reader = FeatherReader(source)
-    return reader.read(columns=columns)
+    return reader.read(columns=columns, nthreads=nthreads)

http://git-wip-us.apache.org/repos/asf/arrow/blob/d8d3d843/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 69c32be..287e0da 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -61,7 +61,8 @@ class TestFeatherReader(unittest.TestCase):
         return counts
 
     def _check_pandas_roundtrip(self, df, expected=None, path=None,
-                                columns=None, null_counts=None):
+                                columns=None, null_counts=None,
+                                nthreads=1):
         if path is None:
             path = random_path()
 
@@ -70,7 +71,7 @@ class TestFeatherReader(unittest.TestCase):
         if not os.path.exists(path):
             raise Exception('file not written')
 
-        result = read_feather(path, columns)
+        result = read_feather(path, columns, nthreads=nthreads)
         if expected is None:
             expected = df
 
@@ -293,6 +294,12 @@ class TestFeatherReader(unittest.TestCase):
         df = pd.DataFrame({'strings': [''] * 10})
         self._check_pandas_roundtrip(df)
 
+    def test_multithreaded_read(self):
+        data = {'c{0}'.format(i): [''] * 10
+                for i in range(100)}
+        df = pd.DataFrame(data)
+        self._check_pandas_roundtrip(df, nthreads=4)
+
     def test_nan_as_null(self):
         # Create a nan that is not numpy.nan
         values = np.array(['foo', np.nan, np.nan * 2, 'bar'] * 10)