You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/04/23 12:45:52 UTC
[arrow] branch master updated: ARROW-1731: [Python] Add columns
selector in Table.from_array
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a5ae134 ARROW-1731: [Python] Add columns selector in Table.from_array
a5ae134 is described below
commit a5ae1344b65cc20cd9c4b03aa87365d4e147986c
Author: Gatis Seja <ga...@hotmail.com>
AuthorDate: Mon Apr 23 14:45:09 2018 +0200
ARROW-1731: [Python] Add columns selector in Table.from_array
Fixes both ARROW-1731 and ARROW-1388
Author: Gatis Seja <ga...@hotmail.com>
Author: Samuel <sa...@bmlltech.com>
Closes #1924 from samuelsinayoko/ARROW-1731-ahl-hackathon and squashes the following commits:
90bf22c <Gatis Seja> flake8-ed pyarrow/table.pxi
4a989eb <Gatis Seja> comment fix, improve test
7651bdb <Gatis Seja> Add tests Table.drop. KeyError if col not found
b3eb0b0 <Samuel> Fix bug in Table.drop implementation
d81e34a <Samuel> Add initial implementation of Table.drop
6da40fc <Gatis Seja> Add tests for Table.drop (cols)
de20a0a <Gatis Seja> Add columns to Table.from_pandas()
f363f72 <Gatis Seja> Add columns to from_pandas, needs dev
71d968d <Samuel> Add `columns` parameter to Table.from_pandas
---
python/pyarrow/pandas_compat.py | 6 +++--
python/pyarrow/table.pxi | 38 ++++++++++++++++++++++++++---
python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++
python/pyarrow/tests/test_table.py | 17 +++++++++++++
4 files changed, 66 insertions(+), 6 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 24da744..c288c7f 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,7 +316,9 @@ def _index_level_name(index, i, column_names):
return '__index_level_{:d}__'.format(i)
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
+ if columns is None:
+ columns = df.columns
column_names = []
index_columns = []
index_column_names = []
@@ -334,7 +336,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
'Duplicate column names found: {}'.format(list(df.columns))
)
- for name in df.columns:
+ for name in columns:
col = df[name]
name = _column_name_to_strings(name)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index cbbfe7d..a97fde2 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -726,7 +726,7 @@ cdef class RecordBatch:
@classmethod
def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
- nthreads=None):
+ nthreads=None, columns=None):
"""
Convert pandas.DataFrame to an Arrow RecordBatch
@@ -742,13 +742,15 @@ cdef class RecordBatch:
nthreads : int, default None (may use up to system CPU count threads)
If greater than 1, convert columns to Arrow in parallel using
indicated number of threads
+ columns : list, optional
+ List of columns to be converted. If None, use all columns.
Returns
-------
pyarrow.RecordBatch
"""
names, arrays, metadata = pdcompat.dataframe_to_arrays(
- df, schema, preserve_index, nthreads=nthreads
+ df, schema, preserve_index, nthreads=nthreads, columns=columns
)
return cls.from_arrays(arrays, names, metadata)
@@ -892,7 +894,7 @@ cdef class Table:
@classmethod
def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
- nthreads=None):
+ nthreads=None, columns=None):
"""
Convert pandas.DataFrame to an Arrow Table
@@ -908,6 +910,9 @@ cdef class Table:
nthreads : int, default None (may use up to system CPU count threads)
If greater than 1, convert columns to Arrow in parallel using
indicated number of threads
+ columns : list, optional
+ List of columns to be converted. If None, use all columns.
+
Returns
-------
@@ -929,7 +934,8 @@ cdef class Table:
df,
schema=schema,
preserve_index=preserve_index,
- nthreads=nthreads
+ nthreads=nthreads,
+ columns=columns
)
return cls.from_arrays(arrays, names=names, metadata=metadata)
@@ -1286,6 +1292,30 @@ cdef class Table:
return pyarrow_wrap_table(c_table)
+ def drop(self, columns):
+ """
+ Drop one or more columns and return a new table.
+
+ columns: list of str
+
+ Returns pa.Table
+ """
+ indices = []
+ for col in columns:
+ idx = self.schema.get_field_index(col)
+ if idx == -1:
+ raise KeyError("Column {!r} not found".format(col))
+ indices.append(idx)
+
+ indices.sort()
+ indices.reverse()
+
+ table = self
+ for idx in indices:
+ table = table.remove_column(idx)
+
+ return table
+
def concat_tables(tables):
"""
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index c43c29d..62ea5b8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -133,6 +133,17 @@ class TestConvertMetadata(object):
table = pa.Table.from_pandas(df)
assert table.column(0).name == '0'
+ def test_from_pandas_with_columns(self):
+ df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]})
+
+ table = pa.Table.from_pandas(df, columns=[0, 1])
+ expected = pa.Table.from_pandas(df[[0, 1]])
+ assert expected.equals(table)
+
+ record_batch_table = pa.RecordBatch.from_pandas(df, columns=[0, 1])
+ record_batch_expected = pa.RecordBatch.from_pandas(df[[0, 1]])
+ assert record_batch_expected.equals(record_batch_table)
+
def test_column_index_names_are_preserved(self):
df = pd.DataFrame({'data': [1, 2, 3]})
df.columns.names = ['a']
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 11555c6..100f2b0 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -341,6 +341,23 @@ def test_table_add_column():
assert t4.equals(expected)
+def test_table_drop():
+ """Drop one or more columns given their labels."""
+ a = pa.array(range(5))
+ b = pa.array([-10, -5, 0, 5, 10])
+ c = pa.array(range(5, 10))
+
+ table = pa.Table.from_arrays([a, b, c], names=('a', 'b', 'c'))
+ t2 = table.drop(['a', 'b'])
+
+ exp = pa.Table.from_arrays([c], names=('c',))
+ assert exp.equals(t2)
+
+ # -- raise KeyError if column not in Table
+ with pytest.raises(KeyError, match="Column 'd' not found"):
+ table.drop(['d'])
+
+
def test_table_remove_column():
data = [
pa.array(range(5)),
--
To stop receiving notification emails like this one, please contact
apitrou@apache.org.