You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/08 15:10:08 UTC

[arrow] branch master updated: ARROW-3002: [Python] Hash more parts of pyarrow.Field

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d247c5b  ARROW-3002: [Python] Hash more parts of pyarrow.Field
d247c5b is described below

commit d247c5b902ca51064a5d7b3787e9f474c38d93b4
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Wed Aug 8 17:10:01 2018 +0200

    ARROW-3002: [Python] Hash more parts of pyarrow.Field
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #2385 from kszucs/ARROW-3002 and squashes the following commits:
    
    e2425e5 <Krisztián Szűcs> prevent holding a global array reference
    151dac0 <Krisztián Szűcs> support for pickling dictionary type
    0e577da <Krisztián Szűcs> hash field nullable property
---
 python/pyarrow/tests/test_types.py | 107 +++++++++++++++++++++----------------
 python/pyarrow/types.pxi           |   5 +-
 2 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 4408859..61f6018 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -23,34 +23,42 @@ import pyarrow as pa
 import pyarrow.types as types
 
 
-MANY_TYPES = [
-    pa.null(),
-    pa.bool_(),
-    pa.int32(),
-    pa.time32('s'),
-    pa.time64('us'),
-    pa.date32(),
-    pa.timestamp('us'),
-    pa.timestamp('us', tz='UTC'),
-    pa.timestamp('us', tz='Europe/Paris'),
-    pa.float16(),
-    pa.float32(),
-    pa.float64(),
-    pa.decimal128(19, 4),
-    pa.string(),
-    pa.binary(),
-    pa.binary(10),
-    pa.list_(pa.int32()),
-    pa.struct([pa.field('a', pa.int32()),
-               pa.field('b', pa.int8()),
-               pa.field('c', pa.string())]),
-    pa.union([pa.field('a', pa.binary(10)),
-              pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
-    pa.union([pa.field('a', pa.binary(10)),
-              pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
-    # XXX Needs array pickling
-    # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
-]
+def get_many_types():
+    # returning them from a function is required because the pa.dictionary
+    # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
+    # checks that the default memory pool has zero allocated bytes
+    return (
+        pa.null(),
+        pa.bool_(),
+        pa.int32(),
+        pa.time32('s'),
+        pa.time64('us'),
+        pa.date32(),
+        pa.timestamp('us'),
+        pa.timestamp('us', tz='UTC'),
+        pa.timestamp('us', tz='Europe/Paris'),
+        pa.float16(),
+        pa.float32(),
+        pa.float64(),
+        pa.decimal128(19, 4),
+        pa.string(),
+        pa.binary(),
+        pa.binary(10),
+        pa.list_(pa.int32()),
+        pa.struct([pa.field('a', pa.int32()),
+                   pa.field('b', pa.int8()),
+                   pa.field('c', pa.string())]),
+        pa.struct([pa.field('a', pa.int32(), nullable=False),
+                   pa.field('b', pa.int8(), nullable=False),
+                   pa.field('c', pa.string())]),
+        pa.union([pa.field('a', pa.binary(10)),
+                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
+        pa.union([pa.field('a', pa.binary(10)),
+                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
+        pa.union([pa.field('a', pa.binary(10), nullable=False),
+                  pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
+        pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
+    )
 
 
 def test_is_boolean():
@@ -208,19 +216,20 @@ def test_union_type():
 
 
 def test_types_hashable():
+    many_types = get_many_types()
     in_dict = {}
-    for i, type_ in enumerate(MANY_TYPES):
+    for i, type_ in enumerate(many_types):
         assert hash(type_) == hash(type_)
         in_dict[type_] = i
-    assert len(in_dict) == len(MANY_TYPES)
-    for i, type_ in enumerate(MANY_TYPES):
+    assert len(in_dict) == len(many_types)
+    for i, type_ in enumerate(many_types):
         assert in_dict[type_] == i
 
 
-@pytest.mark.parametrize('ty', MANY_TYPES, ids=str)
-def test_types_picklable(ty):
-    data = pickle.dumps(ty)
-    assert pickle.loads(data) == ty
+def test_types_picklable():
+    for ty in get_many_types():
+        data = pickle.dumps(ty)
+        assert pickle.loads(data) == ty
 
 
 def test_dictionary_type():
@@ -231,9 +240,11 @@ def test_dictionary_type():
 
 def test_fields_hashable():
     in_dict = {}
-    fields = [pa.field('a', pa.int64()),
-              pa.field('a', pa.int32()),
-              pa.field('b', pa.int32())]
+    fields = [pa.field('a', pa.int32()),
+              pa.field('a', pa.int64()),
+              pa.field('a', pa.int64(), nullable=False),
+              pa.field('b', pa.int32()),
+              pa.field('b', pa.int32(), nullable=False)]
     for i, field in enumerate(fields):
         in_dict[field] = i
     assert len(in_dict) == len(fields)
@@ -285,16 +296,18 @@ def test_decimal_byte_width():
     assert ty.byte_width == 16
 
 
-@pytest.mark.parametrize(('index', 'ty'), enumerate(MANY_TYPES), ids=str)
-def test_type_equality_operators(index, ty):
-    non_pyarrow = ['foo', 16, {'s', 'e', 't'}]
+def test_type_equality_operators():
+    many_types = get_many_types()
+    non_pyarrow = ('foo', 16, {'s', 'e', 't'})
 
-    # could use two parametrization levels, but that'd bloat pytest's output
-    for i, other in enumerate(MANY_TYPES + non_pyarrow):
-        if i == index:
-            assert ty == other
-        else:
-            assert ty != other
+    for index, ty in enumerate(many_types):
+        # could use two parametrization levels,
+        # but that'd bloat pytest's output
+        for i, other in enumerate(many_types + non_pyarrow):
+            if i == index:
+                assert ty == other
+            else:
+                assert ty != other
 
 
 def test_field_basic():
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 8185f39..1e13cef 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -182,6 +182,9 @@ cdef class DictionaryType(DataType):
         DataType.init(self, type)
         self.dict_type = <const CDictionaryType*> type.get()
 
+    def __reduce__(self):
+        return dictionary, (self.index_type, self.dictionary, self.ordered)
+
     property ordered:
 
         def __get__(self):
@@ -420,7 +423,7 @@ cdef class Field:
         return self.__str__()
 
     def __hash__(self):
-        return hash((self.field.name(), self.type.id))
+        return hash((self.field.name(), self.type, self.field.nullable()))
 
     property nullable: