You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/07/17 12:31:20 UTC
[arrow] branch master updated: ARROW-2663: [Python] Make
dictionary_encode and unique accessible on Column / ChunkedArray
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c49a57d ARROW-2663: [Python] Make dictionary_encode and unique accessible on Column / ChunkedArray
c49a57d is described below
commit c49a57d614b8d6fe11b2a3341d3ae725ba79b827
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Tue Jul 17 14:31:13 2018 +0200
ARROW-2663: [Python] Make dictionary_encode and unique accessible on Column / ChunkedArray
Author: Korn, Uwe <Uw...@blue-yonder.com>
Closes #2166 from xhochy/ARROW-2663 and squashes the following commits:
e59f5fae <Korn, Uwe> Add documentation about return type
99c1f127 <Korn, Uwe> ARROW-2663: Make dictionary_encode and unique accessible on Column / ChunkedArray
---
python/pyarrow/table.pxi | 56 ++++++++++++++++++++++++++++++++++++++
python/pyarrow/tests/test_array.py | 8 ++++++
2 files changed, 64 insertions(+)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 963444d..2e3f260 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -124,6 +124,40 @@ cdef class ChunkedArray:
return wrap_array_output(out)
+ def dictionary_encode(self):
+ """
+ Compute dictionary-encoded representation of array
+
+ Returns
+ -------
+ pyarrow.ChunkedArray
+ Same chunking as the input, all chunks share a common dictionary.
+ """
+ cdef CDatum out
+
+ with nogil:
+ check_status(
+ DictionaryEncode(_context(), CDatum(self.sp_chunked_array),
+ &out))
+
+ return wrap_datum(out)
+
+ def unique(self):
+ """
+ Compute distinct elements in array
+
+ Returns
+ -------
+ pyarrow.Array
+ """
+ cdef shared_ptr[CArray] result
+
+ with nogil:
+ check_status(
+ Unique(_context(), CDatum(self.sp_chunked_array), &result))
+
+ return pyarrow_wrap_array(result)
+
def slice(self, offset=0, length=None):
"""
Compute zero-copy slice of this ChunkedArray
@@ -364,6 +398,28 @@ cdef class Column:
casted_data = pyarrow_wrap_chunked_array(out.chunked_array())
return column(self.name, casted_data)
+ def dictionary_encode(self):
+ """
+ Compute dictionary-encoded representation of array
+
+ Returns
+ -------
+ pyarrow.Column
+ Same chunking as the input, all chunks share a common dictionary.
+ """
+ ca = self.data.dictionary_encode()
+ return column(self.name, ca)
+
+ def unique(self):
+ """
+ Compute distinct elements in array
+
+ Returns
+ -------
+ pyarrow.Array
+ """
+ return self.data.unique()
+
def flatten(self, MemoryPool memory_pool=None):
"""
Flatten this Column. If it has a struct type, the column is
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index a0b1a51..3852211 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -516,6 +516,10 @@ def test_unique_simple():
for arr, expected in cases:
result = arr.unique()
assert result.equals(expected)
+ result = pa.column("column", arr).unique()
+ assert result.equals(expected)
+ result = pa.chunked_array([arr]).unique()
+ assert result.equals(expected)
def test_dictionary_encode_simple():
@@ -532,6 +536,10 @@ def test_dictionary_encode_simple():
for arr, expected in cases:
result = arr.dictionary_encode()
assert result.equals(expected)
+ result = pa.column("column", arr).dictionary_encode()
+ assert result.data.chunk(0).equals(expected)
+ result = pa.chunked_array([arr]).dictionary_encode()
+ assert result.chunk(0).equals(expected)
def test_cast_time32_to_int():