You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/07/17 12:31:20 UTC

[arrow] branch master updated: ARROW-2663: [Python] Make dictionary_encode and unique accessible on Column / ChunkedArray

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c49a57d  ARROW-2663: [Python] Make dictionary_encode and unique accessible on Column / ChunkedArray
c49a57d is described below

commit c49a57d614b8d6fe11b2a3341d3ae725ba79b827
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Tue Jul 17 14:31:13 2018 +0200

    ARROW-2663: [Python] Make dictionary_encode and unique accessible on Column / ChunkedArray
    
    Author: Korn, Uwe <Uw...@blue-yonder.com>
    
    Closes #2166 from xhochy/ARROW-2663 and squashes the following commits:
    
    e59f5fae <Korn, Uwe> Add documentation about return type
    99c1f127 <Korn, Uwe> ARROW-2663:  Make dictionary_encode and unique accessible on Column / ChunkedArray
---
 python/pyarrow/table.pxi           | 56 ++++++++++++++++++++++++++++++++++++++
 python/pyarrow/tests/test_array.py |  8 ++++++
 2 files changed, 64 insertions(+)

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 963444d..2e3f260 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -124,6 +124,40 @@ cdef class ChunkedArray:
 
         return wrap_array_output(out)
 
+    def dictionary_encode(self):
+        """
+        Compute dictionary-encoded representation of array
+
+        Returns
+        -------
+        pyarrow.ChunkedArray
+            Same chunking as the input, all chunks share a common dictionary.
+        """
+        cdef CDatum out
+
+        with nogil:
+            check_status(
+                DictionaryEncode(_context(), CDatum(self.sp_chunked_array),
+                                 &out))
+
+        return wrap_datum(out)
+
+    def unique(self):
+        """
+        Compute distinct elements in array
+
+        Returns
+        -------
+        pyarrow.Array
+        """
+        cdef shared_ptr[CArray] result
+
+        with nogil:
+            check_status(
+                Unique(_context(), CDatum(self.sp_chunked_array), &result))
+
+        return pyarrow_wrap_array(result)
+
     def slice(self, offset=0, length=None):
         """
         Compute zero-copy slice of this ChunkedArray
@@ -364,6 +398,28 @@ cdef class Column:
         casted_data = pyarrow_wrap_chunked_array(out.chunked_array())
         return column(self.name, casted_data)
 
+    def dictionary_encode(self):
+        """
+        Compute dictionary-encoded representation of array
+
+        Returns
+        -------
+        pyarrow.Column
+            Same chunking as the input, all chunks share a common dictionary.
+        """
+        ca = self.data.dictionary_encode()
+        return column(self.name, ca)
+
+    def unique(self):
+        """
+        Compute distinct elements in array
+
+        Returns
+        -------
+        pyarrow.Array
+        """
+        return self.data.unique()
+
     def flatten(self, MemoryPool memory_pool=None):
         """
         Flatten this Column.  If it has a struct type, the column is
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index a0b1a51..3852211 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -516,6 +516,10 @@ def test_unique_simple():
     for arr, expected in cases:
         result = arr.unique()
         assert result.equals(expected)
+        result = pa.column("column", arr).unique()
+        assert result.equals(expected)
+        result = pa.chunked_array([arr]).unique()
+        assert result.equals(expected)
 
 
 def test_dictionary_encode_simple():
@@ -532,6 +536,10 @@ def test_dictionary_encode_simple():
     for arr, expected in cases:
         result = arr.dictionary_encode()
         assert result.equals(expected)
+        result = pa.column("column", arr).dictionary_encode()
+        assert result.data.chunk(0).equals(expected)
+        result = pa.chunked_array([arr]).dictionary_encode()
+        assert result.chunk(0).equals(expected)
 
 
 def test_cast_time32_to_int():