You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/13 03:18:29 UTC

arrow git commit: ARROW-574: Python: Add support for nested Python lists in Pandas conversion

Repository: arrow
Updated Branches:
  refs/heads/master 492b3d439 -> 2cf36ef2d


ARROW-574: Python: Add support for nested Python lists in Pandas conversion

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #364 from xhochy/ARROW-574 and squashes the following commits:

3ef02ae [Uwe L. Korn] ARROW-574: Python: Add support for nested Python lists in Pandas conversion


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2cf36ef2
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2cf36ef2
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2cf36ef2

Branch: refs/heads/master
Commit: 2cf36ef2d6aea6a5ddf32c900d33db40d728bcd9
Parents: 492b3d4
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Sun Mar 12 23:18:23 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sun Mar 12 23:18:23 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/tests/pandas_examples.py     | 40 ++++++++++++++++++++++++
 python/pyarrow/tests/test_convert_pandas.py | 14 +++++++--
 python/pyarrow/tests/test_parquet.py        | 17 ++++++++--
 python/src/pyarrow/adapters/builtin.cc      | 27 +++++++++-------
 python/src/pyarrow/adapters/builtin.h       |  7 +++++
 python/src/pyarrow/adapters/pandas.cc       | 25 +++++++++++++--
 6 files changed, 112 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/pyarrow/tests/pandas_examples.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
index 63af423..c9343fc 100644
--- a/python/pyarrow/tests/pandas_examples.py
+++ b/python/pyarrow/tests/pandas_examples.py
@@ -76,3 +76,43 @@ def dataframe_with_arrays():
     schema = pa.Schema.from_fields(fields)
 
     return df, schema
+
+def dataframe_with_lists():
+    """
+    Dataframe with list columns of every possible primtive type.
+
+    Returns
+    -------
+    df: pandas.DataFrame
+    schema: pyarrow.Schema
+        Arrow schema definition that is in line with the constructed df.
+    """
+    arrays = OrderedDict()
+    fields = []
+
+    fields.append(pa.field('int64', pa.list_(pa.int64())))
+    arrays['int64'] = [
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        [0, 1, 2, 3, 4],
+        None,
+        [0]
+    ]
+    fields.append(pa.field('double', pa.list_(pa.double())))
+    arrays['double'] = [
+        [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
+        [0., 1., 2., 3., 4.],
+        None,
+        [0.]
+    ]
+    fields.append(pa.field('str_list', pa.list_(pa.string())))
+    arrays['str_list'] = [
+        [u"1", u"ä"],
+        None,
+        [u"1"],
+        [u"1", u"2", u"3"]
+    ]
+
+    df = pd.DataFrame(arrays)
+    schema = pa.Schema.from_fields(fields)
+
+    return df, schema

http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 953fa2c..a79bb23 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -30,7 +30,7 @@ import pandas.util.testing as tm
 from pyarrow.compat import u
 import pyarrow as A
 
-from .pandas_examples import dataframe_with_arrays
+from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
 
 
 def _alltypes_example(size=100):
@@ -333,7 +333,7 @@ class TestPandasConversion(unittest.TestCase):
         expected['date'] = pd.to_datetime(df['date'])
         tm.assert_frame_equal(result, expected)
 
-    def test_column_of_lists(self):
+    def test_column_of_arrays(self):
         df, schema = dataframe_with_arrays()
         self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
         table = A.Table.from_pandas(df, schema=schema)
@@ -343,6 +343,16 @@ class TestPandasConversion(unittest.TestCase):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], field=field)
 
+    def test_column_of_lists(self):
+        df, schema = dataframe_with_lists()
+        self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
+        table = A.Table.from_pandas(df, schema=schema)
+        assert table.schema.equals(schema)
+
+        for column in df.columns:
+            field = schema.field_by_name(column)
+            self._check_array_roundtrip(df[column], field=field)
+
     def test_threaded_conversion(self):
         df = _alltypes_example()
         self._check_pandas_roundtrip(df, nthreads=2,

http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 96f2d15..c72ff9e 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -23,7 +23,7 @@ import pytest
 from pyarrow.compat import guid
 import pyarrow as pa
 import pyarrow.io as paio
-from .pandas_examples import dataframe_with_arrays
+from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
 
 import numpy as np
 import pandas as pd
@@ -322,7 +322,7 @@ def test_compare_schemas():
 
 
 @parquet
-def test_column_of_lists(tmpdir):
+def test_column_of_arrays(tmpdir):
     df, schema = dataframe_with_arrays()
 
     filename = tmpdir.join('pandas_rountrip.parquet')
@@ -335,6 +335,19 @@ def test_column_of_lists(tmpdir):
 
 
 @parquet
+def test_column_of_lists(tmpdir):
+    df, schema = dataframe_with_lists()
+
+    filename = tmpdir.join('pandas_rountrip.parquet')
+    arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True,
+                                       schema=schema)
+    pq.write_table(arrow_table, filename.strpath, version="2.0")
+    table_read = pq.read_table(filename.strpath)
+    df_read = table_read.to_pandas()
+    pdt.assert_frame_equal(df, df_read)
+
+
+@parquet
 def test_multithreaded_read():
     df = alltypes_sample(size=10000)
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/src/pyarrow/adapters/builtin.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc
index c125cc0..4f7b2cb 100644
--- a/python/src/pyarrow/adapters/builtin.cc
+++ b/python/src/pyarrow/adapters/builtin.cc
@@ -206,8 +206,7 @@ class SeqVisitor {
 };
 
 // Non-exhaustive type inference
-static Status InferArrowType(
-    PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
+Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
   *size = PySequence_Size(obj);
   if (PyErr_Occurred()) {
     // Not a sequence
@@ -496,6 +495,19 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
   return Status::OK();
 }
 
+Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type,
+    const std::shared_ptr<ArrayBuilder>& builder) {
+  std::shared_ptr<SeqConverter> converter = GetConverter(type);
+  if (converter == nullptr) {
+    std::stringstream ss;
+    ss << "No type converter implemented for " << type->ToString();
+    return Status::NotImplemented(ss.str());
+  }
+  converter->Init(builder);
+
+  return converter->AppendData(obj);
+}
+
 Status ConvertPySequence(
     PyObject* obj, MemoryPool* pool, std::shared_ptr<arrow::Array>* out) {
   std::shared_ptr<DataType> type;
@@ -509,19 +521,10 @@ Status ConvertPySequence(
     return Status::OK();
   }
 
-  std::shared_ptr<SeqConverter> converter = GetConverter(type);
-  if (converter == nullptr) {
-    std::stringstream ss;
-    ss << "No type converter implemented for " << type->ToString();
-    return Status::NotImplemented(ss.str());
-  }
-
   // Give the sequence converter an array builder
   std::shared_ptr<ArrayBuilder> builder;
   RETURN_NOT_OK(arrow::MakeBuilder(pool, type, &builder));
-  converter->Init(builder);
-
-  RETURN_NOT_OK(converter->AppendData(obj));
+  RETURN_NOT_OK(AppendPySequence(obj, type, builder));
 
   return builder->Finish(out);
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/src/pyarrow/adapters/builtin.h
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h
index 667298e..0c863a5 100644
--- a/python/src/pyarrow/adapters/builtin.h
+++ b/python/src/pyarrow/adapters/builtin.h
@@ -37,6 +37,13 @@ class Status;
 
 namespace pyarrow {
 
+PYARROW_EXPORT arrow::Status InferArrowType(
+    PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type);
+
+PYARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj,
+    const std::shared_ptr<arrow::DataType>& type,
+    const std::shared_ptr<arrow::ArrayBuilder>& builder);
+
 PYARROW_EXPORT
 arrow::Status ConvertPySequence(
     PyObject* obj, arrow::MemoryPool* pool, std::shared_ptr<arrow::Array>* out);

http://git-wip-us.apache.org/repos/asf/arrow/blob/2cf36ef2/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index eb3ab49..40079b4 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -19,6 +19,7 @@
 
 #include <Python.h>
 
+#include "pyarrow/adapters/builtin.h"
 #include "pyarrow/adapters/pandas.h"
 #include "pyarrow/numpy_interop.h"
 
@@ -1661,6 +1662,7 @@ inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
   typedef npy_traits<ITEM_TYPE> traits;
   typedef typename traits::value_type T;
   typedef typename traits::BuilderClass BuilderT;
+  PyAcquireGIL lock;
 
   auto value_builder = std::make_shared<BuilderT>(pool_, field->type);
   ListBuilder list_builder(pool_, value_builder);
@@ -1688,7 +1690,16 @@ inline Status ArrowSerializer<TYPE>::ConvertTypedLists(
         RETURN_NOT_OK(value_builder->Append(data, size));
       }
     } else if (PyList_Check(objects[i])) {
-      return Status::TypeError("Python lists are not yet supported");
+      int64_t size;
+      std::shared_ptr<arrow::DataType> type;
+      RETURN_NOT_OK(list_builder.Append(true));
+      RETURN_NOT_OK(InferArrowType(objects[i], &size, &type));
+      if (type->type != field->type->type) {
+        std::stringstream ss;
+        ss << type->ToString() << " cannot be converted to " << field->type->ToString();
+        return Status::TypeError(ss.str());
+      }
+      RETURN_NOT_OK(AppendPySequence(objects[i], field->type, value_builder));
     } else {
       return Status::TypeError("Unsupported Python type for list items");
     }
@@ -1702,6 +1713,7 @@ inline Status
 ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
     const std::shared_ptr<Field>& field, std::shared_ptr<Array>* out) {
   // TODO: If there are bytes involed, convert to Binary representation
+  PyAcquireGIL lock;
   bool have_bytes = false;
 
   auto value_builder = std::make_shared<arrow::StringBuilder>(pool_);
@@ -1721,7 +1733,16 @@ ArrowSerializer<NPY_OBJECT>::ConvertTypedLists<NPY_OBJECT, ::arrow::StringType>(
       auto data = reinterpret_cast<PyObject**>(PyArray_DATA(numpy_array));
       RETURN_NOT_OK(AppendObjectStrings(*value_builder.get(), data, size, &have_bytes));
     } else if (PyList_Check(objects[i])) {
-      return Status::TypeError("Python lists are not yet supported");
+      int64_t size;
+      std::shared_ptr<arrow::DataType> type;
+      RETURN_NOT_OK(list_builder.Append(true));
+      RETURN_NOT_OK(InferArrowType(objects[i], &size, &type));
+      if (type->type != Type::STRING) {
+        std::stringstream ss;
+        ss << type->ToString() << " cannot be converted to STRING.";
+        return Status::TypeError(ss.str());
+      }
+      RETURN_NOT_OK(AppendPySequence(objects[i], type, value_builder));
     } else {
       return Status::TypeError("Unsupported Python type for list items");
     }