Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/15 08:09:36 UTC

[arrow] branch master updated: ARROW-2332: Add Feather Dataset class

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 27fc25c  ARROW-2332: Add Feather Dataset class
27fc25c is described below

commit 27fc25cd73b4b806a6e9900b1588701b7fa715b9
Author: Dhruv Madeka <ma...@bu.edu>
AuthorDate: Tue May 15 10:09:23 2018 +0200

    ARROW-2332: Add Feather Dataset class
    
    Added a class to read a list of `feather` files into a PyArrow table or a pandas DataFrame
    
    Author: Dhruv Madeka <ma...@bu.edu>
    
    Closes #2040 from dmadeka/feather-dataset and squashes the following commits:
    
    044e2c63 <Dhruv Madeka> Add Feather Dataset class
---
 python/pyarrow/feather.py            | 68 +++++++++++++++++++++++++++++++++++-
 python/pyarrow/tests/test_feather.py | 26 +++++++++++++-
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 6ebf900..c2157ab 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -24,7 +24,7 @@ import warnings
 
 from pyarrow.compat import pdapi
 from pyarrow.lib import FeatherError  # noqa
-from pyarrow.lib import RecordBatch, Table
+from pyarrow.lib import RecordBatch, Table, concat_tables
 import pyarrow.lib as ext
 
 try:
@@ -94,6 +94,72 @@ class FeatherWriter(object):
         self.writer.close()
 
 
+class FeatherDataset(object):
+    """
+    Encapsulates details of reading a list of Feather files.
+
+    Parameters
+    ----------
+    path_or_paths : List[str]
+        A list of file names
+    validate_schema : boolean, default True
+        Check that individual file schemas are all the same / compatible
+    """
+    def __init__(self, path_or_paths, validate_schema=True):
+        self.paths = path_or_paths
+        self.validate_schema = validate_schema
+
+    def read_table(self, columns=None):
+        """
+        Read multiple Feather files as a single pyarrow.Table
+
+        Parameters
+        ----------
+        columns : List[str]
+            Names of columns to read from each file
+
+        Returns
+        -------
+        pyarrow.Table
+            Content of the files as a table (of columns)
+        """
+        _fil = FeatherReader(self.paths[0]).read_table(columns=columns)
+        self._tables = [_fil]
+        self.schema = _fil.schema
+
+        for fil in self.paths[1:]:
+            fil_table = FeatherReader(fil).read_table(columns=columns)
+            if self.validate_schema:
+                self.validate_schemas(fil, fil_table)
+            self._tables.append(fil_table)
+        return concat_tables(self._tables)
+
+    def validate_schemas(self, piece, table):
+        if not self.schema.equals(table.schema):
+            raise ValueError('Schema in {0!s} was different. \n'
+                             '{1!s}\n\nvs\n\n{2!s}'
+                             .format(piece, self.schema,
+                                     table.schema))
+
+    def read_pandas(self, columns=None, nthreads=1):
+        """
+        Read multiple Feather files as a single pandas DataFrame
+
+        Parameters
+        ----------
+        columns : List[str]
+            Names of columns to read from each file
+        nthreads : int, default 1
+            Number of threads to use when converting to pandas.
+
+        Returns
+        -------
+        pandas.DataFrame
+            Content of the files as a pandas DataFrame
+        """
+        return self.read_table(columns=columns).to_pandas(nthreads=nthreads)
+
+
 def write_feather(df, dest):
     """
     Write a pandas.DataFrame to Feather format
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 9bbfe05..46e4e5f 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -29,7 +29,7 @@ import pandas as pd
 
 import pyarrow as pa
 from pyarrow.feather import (read_feather, write_feather,
-                             read_table, FeatherReader)
+                             read_table, FeatherReader, FeatherDataset)
 from pyarrow.lib import FeatherWriter
 
 
@@ -100,6 +100,30 @@ class TestFeatherReader(unittest.TestCase):
 
         pytest.raises(exc, f)
 
+    def test_dataset(self):
+        num_values = (100, 100)
+        num_files = 5
+        paths = [random_path() for i in range(num_files)]
+        df = pd.DataFrame(np.random.randn(*num_values),
+                          columns=['col_' + str(i)
+                                   for i in range(num_values[1])])
+
+        self.test_files.extend(paths)
+        for index, path in enumerate(paths):
+            rows = (index * (num_values[0] // num_files),
+                    (index + 1) * (num_values[0] // num_files))
+            writer = FeatherWriter()
+            writer.open(path)
+
+            for col in range(num_values[1]):
+                writer.write_array(df.columns[col],
+                                   df.iloc[rows[0]:rows[1], col])
+
+            writer.close()
+
+        data = FeatherDataset(paths).read_pandas()
+        assert_frame_equal(data, df)
+
     def test_num_rows_attr(self):
         df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
         path = random_path()

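For reference, a minimal usage sketch of the FeatherDataset class added by this
patch; the file and column names below are illustrative only and not taken from
the commit:

    import pyarrow.feather as feather

    # Combine several Feather files that are expected to share a schema.
    dataset = feather.FeatherDataset(['data_0.feather', 'data_1.feather'],
                                     validate_schema=True)

    # Read all files into a single pyarrow.Table, optionally selecting columns.
    table = dataset.read_table(columns=['col_0', 'col_1'])

    # Or convert the concatenated result directly to a pandas DataFrame.
    df = dataset.read_pandas(columns=['col_0', 'col_1'])
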
-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.