You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/04/23 12:45:52 UTC

[arrow] branch master updated: ARROW-1731: [Python] Add columns selector in Table.from_array

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a5ae134  ARROW-1731: [Python] Add columns selector in Table.from_array
a5ae134 is described below

commit a5ae1344b65cc20cd9c4b03aa87365d4e147986c
Author: Gatis Seja <ga...@hotmail.com>
AuthorDate: Mon Apr 23 14:45:09 2018 +0200

    ARROW-1731: [Python] Add columns selector in Table.from_array
    
    Fixes both ARROW-1731 and ARROW-1388
    
    Author: Gatis Seja <ga...@hotmail.com>
    Author: Samuel <sa...@bmlltech.com>
    
    Closes #1924 from samuelsinayoko/ARROW-1731-ahl-hackathon and squashes the following commits:
    
    90bf22c <Gatis Seja> flake8-ed pyarrow/table.pxi
    4a989eb <Gatis Seja> comment fix, improve test
    7651bdb <Gatis Seja> Add tests Table.drop. KeyError if col not found
    b3eb0b0 <Samuel> Fix bug in Table.drop implementation
    d81e34a <Samuel> Add initial implementation of Table.drop
    6da40fc <Gatis Seja> Add tests for Table.drop (cols)
    de20a0a <Gatis Seja> Add columns to Table.from_pandas()
    f363f72 <Gatis Seja> Add columns to from_pandas, needs dev
    71d968d <Samuel> Add `columns` parameter to Table.from_pandas
---
 python/pyarrow/pandas_compat.py             |  6 +++--
 python/pyarrow/table.pxi                    | 38 ++++++++++++++++++++++++++---
 python/pyarrow/tests/test_convert_pandas.py | 11 +++++++++
 python/pyarrow/tests/test_table.py          | 17 +++++++++++++
 4 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 24da744..c288c7f 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,7 +316,9 @@ def _index_level_name(index, i, column_names):
         return '__index_level_{:d}__'.format(i)
 
 
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
+    if columns is None:
+        columns = df.columns
     column_names = []
     index_columns = []
     index_column_names = []
@@ -334,7 +336,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1):
             'Duplicate column names found: {}'.format(list(df.columns))
         )
 
-    for name in df.columns:
+    for name in columns:
         col = df[name]
         name = _column_name_to_strings(name)
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index cbbfe7d..a97fde2 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -726,7 +726,7 @@ cdef class RecordBatch:
 
     @classmethod
     def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
-                    nthreads=None):
+                    nthreads=None, columns=None):
         """
         Convert pandas.DataFrame to an Arrow RecordBatch
 
@@ -742,13 +742,15 @@ cdef class RecordBatch:
         nthreads : int, default None (may use up to system CPU count threads)
             If greater than 1, convert columns to Arrow in parallel using
             indicated number of threads
+        columns : list, optional
+           List of columns to be converted. If None, use all columns.
 
         Returns
         -------
         pyarrow.RecordBatch
         """
         names, arrays, metadata = pdcompat.dataframe_to_arrays(
-            df, schema, preserve_index, nthreads=nthreads
+            df, schema, preserve_index, nthreads=nthreads, columns=columns
         )
         return cls.from_arrays(arrays, names, metadata)
 
@@ -892,7 +894,7 @@ cdef class Table:
 
     @classmethod
     def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
-                    nthreads=None):
+                    nthreads=None, columns=None):
         """
         Convert pandas.DataFrame to an Arrow Table
 
@@ -908,6 +910,9 @@ cdef class Table:
         nthreads : int, default None (may use up to system CPU count threads)
             If greater than 1, convert columns to Arrow in parallel using
             indicated number of threads
+        columns : list, optional
+           List of columns to be converted. If None, use all columns.
+
 
         Returns
         -------
@@ -929,7 +934,8 @@ cdef class Table:
             df,
             schema=schema,
             preserve_index=preserve_index,
-            nthreads=nthreads
+            nthreads=nthreads,
+            columns=columns
         )
         return cls.from_arrays(arrays, names=names, metadata=metadata)
 
@@ -1286,6 +1292,30 @@ cdef class Table:
 
         return pyarrow_wrap_table(c_table)
 
+    def drop(self, columns):
+        """
+        Drop one or more columns and return a new table.
+
+        columns: list of str
+
+        Returns pa.Table
+        """
+        indices = []
+        for col in columns:
+            idx = self.schema.get_field_index(col)
+            if idx == -1:
+                raise KeyError("Column {!r} not found".format(col))
+            indices.append(idx)
+
+        indices.sort()
+        indices.reverse()
+
+        table = self
+        for idx in indices:
+            table = table.remove_column(idx)
+
+        return table
+
 
 def concat_tables(tables):
     """
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index c43c29d..62ea5b8 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -133,6 +133,17 @@ class TestConvertMetadata(object):
         table = pa.Table.from_pandas(df)
         assert table.column(0).name == '0'
 
+    def test_from_pandas_with_columns(self):
+        df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]})
+
+        table = pa.Table.from_pandas(df, columns=[0, 1])
+        expected = pa.Table.from_pandas(df[[0, 1]])
+        assert expected.equals(table)
+
+        record_batch_table = pa.RecordBatch.from_pandas(df, columns=[0, 1])
+        record_batch_expected = pa.RecordBatch.from_pandas(df[[0, 1]])
+        assert record_batch_expected.equals(record_batch_table)
+
     def test_column_index_names_are_preserved(self):
         df = pd.DataFrame({'data': [1, 2, 3]})
         df.columns.names = ['a']
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 11555c6..100f2b0 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -341,6 +341,23 @@ def test_table_add_column():
     assert t4.equals(expected)
 
 
+def test_table_drop():
+    """ drop one or more columns given labels"""
+    a = pa.array(range(5))
+    b = pa.array([-10, -5, 0, 5, 10])
+    c = pa.array(range(5, 10))
+
+    table = pa.Table.from_arrays([a, b, c], names=('a', 'b', 'c'))
+    t2 = table.drop(['a', 'b'])
+
+    exp = pa.Table.from_arrays([c], names=('c',))
+    assert exp.equals(t2)
+
+    # -- raise KeyError if column not in Table
+    with pytest.raises(KeyError, match="Column 'd' not found"):
+        table.drop(['d'])
+
+
 def test_table_remove_column():
     data = [
         pa.array(range(5)),

-- 
To stop receiving notification emails like this one, please contact
apitrou@apache.org.