You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/01 22:12:15 UTC

arrow git commit: ARROW-1125: partial schemas for Table.from_pandas

Repository: arrow
Updated Branches:
  refs/heads/master 930db87d6 -> c294ec3db


ARROW-1125: partial schemas for Table.from_pandas

Author: fjetter <fl...@blue-yonder.com>

Closes #790 from fjetter/ARROW-1125-partial-schemas and squashes the following commits:

0a58b708 [fjetter] Remove trailing whitespaces
87ccb0c4 [fjetter] Fix indentation to respect max line length
92001422 [fjetter] Remove template from TypeNotImplemented status message
67dbba5d [fjetter] Remove range from test due to pandas bug on Windows
4890b5af [fjetter] Refactor TypeNotImplemented message
9de8611c [fjetter] Partial schema test in test_convert_pandas.py
dcf44f09 [fjetter] Allow partial schemas in Table.from_pandas again
66671a27 [fjetter] Improved NotImplemented messages in PandasConverter


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c294ec3d
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c294ec3d
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c294ec3d

Branch: refs/heads/master
Commit: c294ec3dbdd3f5cfa55a6eb5c7b27535b240ccf0
Parents: 930db87
Author: fjetter <fl...@blue-yonder.com>
Authored: Sat Jul 1 18:12:09 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Sat Jul 1 18:12:09 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/python/pandas_convert.cc      | 22 ++++++++++++----------
 cpp/src/arrow/type.h                        |  3 +++
 python/pyarrow/table.pxi                    |  3 ++-
 python/pyarrow/tests/test_convert_pandas.py | 24 ++++++++++++++++++++++--
 python/pyarrow/tests/test_table.py          | 18 ------------------
 5 files changed, 39 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c294ec3d/cpp/src/arrow/python/pandas_convert.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc
index 2918f9e..9b65570 100644
--- a/cpp/src/arrow/python/pandas_convert.cc
+++ b/cpp/src/arrow/python/pandas_convert.cc
@@ -362,23 +362,25 @@ class PandasConverter {
   Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
   Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
 
-  Status Visit(const NullType& type) { return Status::NotImplemented("null"); }
+  Status TypeNotImplemented(std::string type_name) {
+    std::stringstream ss;
+    ss << "PandasConverter doesn't implement <" << type_name << "> conversion. ";
+    return Status::NotImplemented(ss.str());
+  }
 
-  Status Visit(const BinaryType& type) { return Status::NotImplemented(type.ToString()); }
+  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
+
+  Status Visit(const BinaryType& type) { return TypeNotImplemented(type.ToString()); }
 
   Status Visit(const FixedSizeBinaryType& type) {
-    return Status::NotImplemented(type.ToString());
+    return TypeNotImplemented(type.ToString());
   }
 
-  Status Visit(const DecimalType& type) {
-    return Status::NotImplemented(type.ToString());
-  }
+  Status Visit(const DecimalType& type) { return TypeNotImplemented(type.ToString()); }
 
-  Status Visit(const DictionaryType& type) {
-    return Status::NotImplemented(type.ToString());
-  }
+  Status Visit(const DictionaryType& type) { return TypeNotImplemented(type.ToString()); }
 
-  Status Visit(const NestedType& type) { return Status::NotImplemented(type.ToString()); }
+  Status Visit(const NestedType& type) { return TypeNotImplemented(type.ToString()); }
 
   Status Convert() {
     if (PyArray_NDIM(arr_) != 1) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/c294ec3d/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 3e85291..8338800 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -194,6 +194,7 @@ class ARROW_EXPORT FloatingPoint : public PrimitiveCType {
 class ARROW_EXPORT NestedType : public DataType {
  public:
   using DataType::DataType;
+  static std::string name() { return "nested"; }
 };
 
 class NoExtraMeta {};
@@ -406,6 +407,7 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType {
 
   Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+  static std::string name() {return "fixed_size_binary"; }
 
   std::vector<BufferDescr> GetBufferLayout() const override;
 
@@ -674,6 +676,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
 
   Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+  static std::string name() { return "dictionary"; }
 
   bool ordered() const { return ordered_; }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/c294ec3d/python/pyarrow/table.pxi
----------------------------------------------------------------------
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 7d44f2e..ef83636 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -333,7 +333,8 @@ cdef tuple _dataframe_to_arrays(
     for name in df.columns:
         col = df[name]
         if schema is not None:
-            type = schema.field_by_name(name).type
+            field = schema.field_by_name(name)
+            type = getattr(field, "type", None)
 
         arr = arrays.append(
             Array.from_pandas(

http://git-wip-us.apache.org/repos/asf/arrow/blob/c294ec3d/python/pyarrow/tests/test_convert_pandas.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 49b7eb7..9cce7bb 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -102,13 +102,11 @@ class TestPandasConversion(unittest.TestCase):
         df = pd.DataFrame({'a': [None, None, None]})
         self._check_pandas_roundtrip(df)
 
-
     def test_all_none_category(self):
         df = pd.DataFrame({'a': [None, None, None]})
         df['a'] = df['a'].astype('category')
         self._check_pandas_roundtrip(df)
 
-
     def test_float_no_nulls(self):
         data = {}
         fields = []
@@ -654,3 +652,25 @@ class TestPandasConversion(unittest.TestCase):
         table = pa.Table.from_pandas(df)
         result_df = table.to_pandas()
         tm.assert_frame_equal(result_df, df)
+
+    def test_partial_schema(self):
+        data = OrderedDict([
+            ('a', [0, 1, 2, 3, 4]),
+            ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
+            ('c', [-10, -5, 0, 5, 10])
+        ])
+        df = pd.DataFrame(data)
+
+        partial_schema = pa.schema([
+            pa.field('a', pa.int64()),
+            pa.field('b', pa.int32())
+        ])
+
+        expected_schema = pa.schema([
+            pa.field('a', pa.int64()),
+            pa.field('b', pa.int32()),
+            pa.field('c', pa.int64())
+        ])
+
+        self._check_pandas_roundtrip(df, schema=partial_schema,
+                                     expected_schema=expected_schema)

http://git-wip-us.apache.org/repos/asf/arrow/blob/c294ec3d/python/pyarrow/tests/test_table.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index afc9520..3198941 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -238,24 +238,6 @@ def test_concat_tables():
     assert result.equals(expected)
 
 
-def test_table_pandas():
-    data = [
-        pa.array(range(5)),
-        pa.array([-10, -5, 0, 5, 10])
-    ]
-    table = pa.Table.from_arrays(data, names=('a', 'b'))
-
-    # TODO: Use this part once from_pandas is implemented
-    # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
-    # df = pd.DataFrame(data)
-    # pa.Table.from_pandas(df)
-
-    df = table.to_pandas()
-    assert set(df.columns) == set(('a', 'b'))
-    assert df.shape == (5, 2)
-    assert df.loc[0, 'b'] == -10
-
-
 def test_table_negative_indexing():
     data = [
         pa.array(range(5)),