You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/06 13:10:18 UTC

arrow git commit: ARROW-752: [Python] Support boxed Arrow arrays as input to DictionaryArray.from_arrays

Repository: arrow
Updated Branches:
  refs/heads/master f4fcb42c2 -> ddf880b31


ARROW-752: [Python] Support boxed Arrow arrays as input to DictionaryArray.from_arrays

Author: Wes McKinney <we...@twosigma.com>

Closes #496 from wesm/ARROW-752 and squashes the following commits:

2f57574 [Wes McKinney] Support boxed Arrow arrays as input to DictionaryArray.from_arrays


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ddf880b3
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ddf880b3
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ddf880b3

Branch: refs/heads/master
Commit: ddf880b312c1b11739d09bc014d4649b8f2f26d4
Parents: f4fcb42
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Apr 6 09:10:13 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Apr 6 09:10:13 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/array.pyx           | 31 +++++++++----
 python/pyarrow/tests/test_array.py | 81 +++++++++++++++++++++++++++------
 2 files changed, 89 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/ddf880b3/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 1f59556..9f302e0 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -497,8 +497,12 @@ cdef class DictionaryArray(Array):
 
     cdef getitem(self, int64_t i):
         cdef Array dictionary = self.dictionary
-        cdef int64_t index = self.indices[i].as_py()
-        return scalar.box_scalar(dictionary.type, dictionary.sp_array, index)
+        index = self.indices[i]
+        if index is NA:
+            return index
+        else:
+            return scalar.box_scalar(dictionary.type, dictionary.sp_array,
+                                     index.as_py())
 
     property dictionary:
 
@@ -544,15 +548,24 @@ cdef class DictionaryArray(Array):
             shared_ptr[CDataType] c_type
             shared_ptr[CArray] c_result
 
-        if mask is None:
-            mask = indices == -1
+        if isinstance(indices, Array):
+            if mask is not None:
+                raise NotImplementedError(
+                    "mask not implemented with Arrow array inputs yet")
+            arrow_indices = indices
         else:
-            mask = mask | (indices == -1)
+            if mask is None:
+                mask = indices == -1
+            else:
+                mask = mask | (indices == -1)
+            arrow_indices = Array.from_numpy(indices, mask=mask,
+                                             memory_pool=memory_pool)
 
-        arrow_indices = Array.from_numpy(indices, mask=mask,
-                                         memory_pool=memory_pool)
-        arrow_dictionary = Array.from_numpy(dictionary,
-                                            memory_pool=memory_pool)
+        if isinstance(dictionary, Array):
+            arrow_dictionary = dictionary
+        else:
+            arrow_dictionary = Array.from_numpy(dictionary,
+                                                memory_pool=memory_pool)
 
         if not isinstance(arrow_indices, IntegerArray):
             raise ValueError('Indices must be integer type')

http://git-wip-us.apache.org/repos/asf/arrow/blob/ddf880b3/python/pyarrow/tests/test_array.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index d8b2e2f..57b17f6 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -15,30 +15,33 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import pytest
 import sys
 
-import pytest
+import numpy as np
+import pandas as pd
+import pandas.util.testing as tm
 
-import pyarrow
+import pyarrow as pa
 import pyarrow.formatting as fmt
 
 
 def test_total_bytes_allocated():
-    assert pyarrow.total_allocated_bytes() == 0
+    assert pa.total_allocated_bytes() == 0
 
 
 def test_repr_on_pre_init_array():
-    arr = pyarrow.array.Array()
+    arr = pa.Array()
     assert len(repr(arr)) > 0
 
 
 def test_getitem_NA():
-    arr = pyarrow.from_pylist([1, None, 2])
-    assert arr[1] is pyarrow.NA
+    arr = pa.from_pylist([1, None, 2])
+    assert arr[1] is pa.NA
 
 
 def test_list_format():
-    arr = pyarrow.from_pylist([[1], None, [2, 3, None]])
+    arr = pa.from_pylist([[1], None, [2, 3, None]])
     result = fmt.array_format(arr)
     expected = """\
 [
@@ -52,7 +55,7 @@ def test_list_format():
 
 
 def test_string_format():
-    arr = pyarrow.from_pylist(['', None, 'foo'])
+    arr = pa.from_pylist(['', None, 'foo'])
     result = fmt.array_format(arr)
     expected = """\
 [
@@ -64,7 +67,7 @@ def test_string_format():
 
 
 def test_long_array_format():
-    arr = pyarrow.from_pylist(range(100))
+    arr = pa.from_pylist(range(100))
     result = fmt.array_format(arr, window=2)
     expected = """\
 [
@@ -80,7 +83,7 @@ def test_long_array_format():
 def test_to_pandas_zero_copy():
     import gc
 
-    arr = pyarrow.from_pylist(range(10))
+    arr = pa.from_pylist(range(10))
 
     for i in range(10):
         np_arr = arr.to_pandas()
@@ -90,7 +93,7 @@ def test_to_pandas_zero_copy():
     assert sys.getrefcount(arr) == 2
 
     for i in range(10):
-        arr = pyarrow.from_pylist(range(10))
+        arr = pa.from_pylist(range(10))
         np_arr = arr.to_pandas()
         arr = None
         gc.collect()
@@ -105,14 +108,14 @@ def test_to_pandas_zero_copy():
 
 
 def test_array_slice():
-    arr = pyarrow.from_pylist(range(10))
+    arr = pa.from_pylist(range(10))
 
     sliced = arr.slice(2)
-    expected = pyarrow.from_pylist(range(2, 10))
+    expected = pa.from_pylist(range(2, 10))
     assert sliced.equals(expected)
 
     sliced2 = arr.slice(2, 4)
-    expected2 = pyarrow.from_pylist(range(2, 6))
+    expected2 = pa.from_pylist(range(2, 6))
     assert sliced2.equals(expected2)
 
     # 0 offset
@@ -136,3 +139,53 @@ def test_array_slice():
 
     with pytest.raises(IndexError):
         arr[::2]
+
+
+def test_dictionary_from_numpy():
+    indices = np.repeat([0, 1, 2], 2)
+    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
+    mask = np.array([False, False, True, False, False, False])
+
+    d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
+    d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
+
+    for i in range(len(indices)):
+        assert d1[i].as_py() == dictionary[indices[i]]
+
+        if mask[i]:
+            assert d2[i] is pa.NA
+        else:
+            assert d2[i].as_py() == dictionary[indices[i]]
+
+
+def test_dictionary_from_boxed_arrays():
+    indices = np.repeat([0, 1, 2], 2)
+    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
+
+    iarr = pa.Array.from_numpy(indices)
+    darr = pa.Array.from_numpy(dictionary)
+
+    d1 = pa.DictionaryArray.from_arrays(iarr, darr)
+
+    for i in range(len(indices)):
+        assert d1[i].as_py() == dictionary[indices[i]]
+
+
+def test_dictionary_with_pandas():
+    indices = np.repeat([0, 1, 2], 2)
+    dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
+    mask = np.array([False, False, True, False, False, False])
+
+    d1 = pa.DictionaryArray.from_arrays(indices, dictionary)
+    d2 = pa.DictionaryArray.from_arrays(indices, dictionary, mask=mask)
+
+    pandas1 = d1.to_pandas()
+    ex_pandas1 = pd.Categorical.from_codes(indices, categories=dictionary)
+
+    tm.assert_series_equal(pd.Series(pandas1), pd.Series(ex_pandas1))
+
+    pandas2 = d2.to_pandas()
+    ex_pandas2 = pd.Categorical.from_codes(np.where(mask, -1, indices),
+                                           categories=dictionary)
+
+    tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2))