You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/08/08 15:10:08 UTC
[arrow] branch master updated: ARROW-3002: [Python] Hash more parts of pyarrow.Field
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d247c5b ARROW-3002: [Python] Hash more parts of pyarrow.Field
d247c5b is described below
commit d247c5b902ca51064a5d7b3787e9f474c38d93b4
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Wed Aug 8 17:10:01 2018 +0200
ARROW-3002: [Python] Hash more parts of pyarrow.Field
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #2385 from kszucs/ARROW-3002 and squashes the following commits:
e2425e5 <Krisztián Szűcs> prevent holding a global array reference
151dac0 <Krisztián Szűcs> support for pickling dictionary type
0e577da <Krisztián Szűcs> hash field nullable property
---
python/pyarrow/tests/test_types.py | 107 +++++++++++++++++++++----------------
python/pyarrow/types.pxi | 5 +-
2 files changed, 64 insertions(+), 48 deletions(-)
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 4408859..61f6018 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -23,34 +23,42 @@ import pyarrow as pa
import pyarrow.types as types
-MANY_TYPES = [
- pa.null(),
- pa.bool_(),
- pa.int32(),
- pa.time32('s'),
- pa.time64('us'),
- pa.date32(),
- pa.timestamp('us'),
- pa.timestamp('us', tz='UTC'),
- pa.timestamp('us', tz='Europe/Paris'),
- pa.float16(),
- pa.float32(),
- pa.float64(),
- pa.decimal128(19, 4),
- pa.string(),
- pa.binary(),
- pa.binary(10),
- pa.list_(pa.int32()),
- pa.struct([pa.field('a', pa.int32()),
- pa.field('b', pa.int8()),
- pa.field('c', pa.string())]),
- pa.union([pa.field('a', pa.binary(10)),
- pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
- pa.union([pa.field('a', pa.binary(10)),
- pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
- # XXX Needs array pickling
- # pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c'])),
-]
+def get_many_types():
+ # returning them from a function is required because the pa.dictionary
+ # type holds a pyarrow array and test_array.py::test_total_bytes_allocated
+ # checks that the default memory pool has zero allocated bytes
+ return (
+ pa.null(),
+ pa.bool_(),
+ pa.int32(),
+ pa.time32('s'),
+ pa.time64('us'),
+ pa.date32(),
+ pa.timestamp('us'),
+ pa.timestamp('us', tz='UTC'),
+ pa.timestamp('us', tz='Europe/Paris'),
+ pa.float16(),
+ pa.float32(),
+ pa.float64(),
+ pa.decimal128(19, 4),
+ pa.string(),
+ pa.binary(),
+ pa.binary(10),
+ pa.list_(pa.int32()),
+ pa.struct([pa.field('a', pa.int32()),
+ pa.field('b', pa.int8()),
+ pa.field('c', pa.string())]),
+ pa.struct([pa.field('a', pa.int32(), nullable=False),
+ pa.field('b', pa.int8(), nullable=False),
+ pa.field('c', pa.string())]),
+ pa.union([pa.field('a', pa.binary(10)),
+ pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE),
+ pa.union([pa.field('a', pa.binary(10)),
+ pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
+ pa.union([pa.field('a', pa.binary(10), nullable=False),
+ pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE),
+ pa.dictionary(pa.int32(), pa.array(['a', 'b', 'c']))
+ )
def test_is_boolean():
@@ -208,19 +216,20 @@ def test_union_type():
def test_types_hashable():
+ many_types = get_many_types()
in_dict = {}
- for i, type_ in enumerate(MANY_TYPES):
+ for i, type_ in enumerate(many_types):
assert hash(type_) == hash(type_)
in_dict[type_] = i
- assert len(in_dict) == len(MANY_TYPES)
- for i, type_ in enumerate(MANY_TYPES):
+ assert len(in_dict) == len(many_types)
+ for i, type_ in enumerate(many_types):
assert in_dict[type_] == i
-@pytest.mark.parametrize('ty', MANY_TYPES, ids=str)
-def test_types_picklable(ty):
- data = pickle.dumps(ty)
- assert pickle.loads(data) == ty
+def test_types_picklable():
+ for ty in get_many_types():
+ data = pickle.dumps(ty)
+ assert pickle.loads(data) == ty
def test_dictionary_type():
@@ -231,9 +240,11 @@ def test_dictionary_type():
def test_fields_hashable():
in_dict = {}
- fields = [pa.field('a', pa.int64()),
- pa.field('a', pa.int32()),
- pa.field('b', pa.int32())]
+ fields = [pa.field('a', pa.int32()),
+ pa.field('a', pa.int64()),
+ pa.field('a', pa.int64(), nullable=False),
+ pa.field('b', pa.int32()),
+ pa.field('b', pa.int32(), nullable=False)]
for i, field in enumerate(fields):
in_dict[field] = i
assert len(in_dict) == len(fields)
@@ -285,16 +296,18 @@ def test_decimal_byte_width():
assert ty.byte_width == 16
-@pytest.mark.parametrize(('index', 'ty'), enumerate(MANY_TYPES), ids=str)
-def test_type_equality_operators(index, ty):
- non_pyarrow = ['foo', 16, {'s', 'e', 't'}]
+def test_type_equality_operators():
+ many_types = get_many_types()
+ non_pyarrow = ('foo', 16, {'s', 'e', 't'})
- # could use two parametrization levels, but that'd bloat pytest's output
- for i, other in enumerate(MANY_TYPES + non_pyarrow):
- if i == index:
- assert ty == other
- else:
- assert ty != other
+ for index, ty in enumerate(many_types):
+ # could use two parametrization levels,
+ # but that'd bloat pytest's output
+ for i, other in enumerate(many_types + non_pyarrow):
+ if i == index:
+ assert ty == other
+ else:
+ assert ty != other
def test_field_basic():
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 8185f39..1e13cef 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -182,6 +182,9 @@ cdef class DictionaryType(DataType):
DataType.init(self, type)
self.dict_type = <const CDictionaryType*> type.get()
+ def __reduce__(self):
+ return dictionary, (self.index_type, self.dictionary, self.ordered)
+
property ordered:
def __get__(self):
@@ -420,7 +423,7 @@ cdef class Field:
return self.__str__()
def __hash__(self):
- return hash((self.field.name(), self.type.id))
+ return hash((self.field.name(), self.type, self.field.nullable()))
property nullable: