You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/15 08:09:36 UTC
[arrow] branch master updated: ARROW-2332: Add Feather Dataset class
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 27fc25c ARROW-2332: Add Feather Dataset class
27fc25c is described below
commit 27fc25cd73b4b806a6e9900b1588701b7fa715b9
Author: Dhruv Madeka <ma...@bu.edu>
AuthorDate: Tue May 15 10:09:23 2018 +0200
ARROW-2332: Add Feather Dataset class
Added a class to read a list of `feather` files into a PyArrow table or a pandas DataFrame
Author: Dhruv Madeka <ma...@bu.edu>
Closes #2040 from dmadeka/feather-dataset and squashes the following commits:
044e2c63 <Dhruv Madeka> Add Feather Dataset class
---
python/pyarrow/feather.py | 68 +++++++++++++++++++++++++++++++++++-
python/pyarrow/tests/test_feather.py | 26 +++++++++++++-
2 files changed, 92 insertions(+), 2 deletions(-)
diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py
index 6ebf900..c2157ab 100644
--- a/python/pyarrow/feather.py
+++ b/python/pyarrow/feather.py
@@ -24,7 +24,7 @@ import warnings
from pyarrow.compat import pdapi
from pyarrow.lib import FeatherError # noqa
-from pyarrow.lib import RecordBatch, Table
+from pyarrow.lib import RecordBatch, Table, concat_tables
import pyarrow.lib as ext
try:
@@ -94,6 +94,72 @@ class FeatherWriter(object):
self.writer.close()
+class FeatherDataset(object):
+ """
+ Encapsulates details of reading a list of Feather files.
+
+ Parameters
+ ----------
+ path_or_paths : List[str]
+ A list of file names
+ validate_schema : boolean, default True
+ Check that individual file schemas are all the same / compatible
+ """
+ def __init__(self, path_or_paths, validate_schema=True):
+ self.paths = path_or_paths
+ self.validate_schema = validate_schema
+
+ def read_table(self, columns=None):
+ """
+ Read multiple feather files as a single pyarrow.Table
+
+ Parameters
+ ----------
+ columns : List[str]
+ Names of columns to read from the file
+
+ Returns
+ -------
+ pyarrow.Table
+ Content of the file as a table (of columns)
+ """
+ _fil = FeatherReader(self.paths[0]).read_table(columns=columns)
+ self._tables = [_fil]
+ self.schema = _fil.schema
+
+ for fil in self.paths[1:]:
+ fil_table = FeatherReader(fil).read_table(columns=columns)
+ if self.validate_schema:
+ self.validate_schemas(fil, fil_table)
+ self._tables.append(fil_table)
+ return concat_tables(self._tables)
+
+ def validate_schemas(self, piece, table):
+ if not self.schema.equals(table.schema):
+ raise ValueError('Schema in {0!s} was different. \n'
+ '{1!s}\n\nvs\n\n{2!s}'
+ .format(piece, self.schema,
+ table.schema))
+
+ def read_pandas(self, columns=None, nthreads=1):
+ """
+ Read multiple Feather files as a single pandas DataFrame
+
+ Parameters
+ ----------
+ columns : List[str]
+ Names of columns to read from the file
+ nthreads : int, default 1
+ Number of columns to read in parallel.
+
+ Returns
+ -------
+ pandas.DataFrame
+ Content of the file as a pandas DataFrame (of columns)
+ """
+ return self.read_table(columns=columns).to_pandas(nthreads=nthreads)
+
+
def write_feather(df, dest):
"""
Write a pandas.DataFrame to Feather format
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 9bbfe05..46e4e5f 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -29,7 +29,7 @@ import pandas as pd
import pyarrow as pa
from pyarrow.feather import (read_feather, write_feather,
- read_table, FeatherReader)
+ read_table, FeatherReader, FeatherDataset)
from pyarrow.lib import FeatherWriter
@@ -100,6 +100,30 @@ class TestFeatherReader(unittest.TestCase):
pytest.raises(exc, f)
+ def test_dataset(self):
+ num_values = (100, 100)
+ num_files = 5
+ paths = [random_path() for i in range(num_files)]
+ df = pd.DataFrame(np.random.randn(*num_values),
+ columns=['col_' + str(i)
+ for i in range(num_values[1])])
+
+ self.test_files.extend(paths)
+ for index, path in enumerate(paths):
+ rows = (index * (num_values[0] // num_files),
+ (index + 1) * (num_values[0] // num_files))
+ writer = FeatherWriter()
+ writer.open(path)
+
+ for col in range(num_values[1]):
+ writer.write_array(df.columns[col],
+ df.iloc[rows[0]:rows[1], col])
+
+ writer.close()
+
+ data = FeatherDataset(paths).read_pandas()
+ assert_frame_equal(data, df)
+
def test_num_rows_attr(self):
df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
path = random_path()
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.