You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/04 20:22:34 UTC
arrow git commit: ARROW-668: [Python] Box timestamp values as
pandas.Timestamp if available, attach tzinfo
Repository: arrow
Updated Branches:
refs/heads/master 360942e61 -> e29a7d4ca
ARROW-668: [Python] Box timestamp values as pandas.Timestamp if available, attach tzinfo
I'm not sure how to easily test the behavior if pandas is not present. I created an environment without pandas and added some fixes so that I could verify the behavior, but at some point we should create a "no pandas" test suite to see what using pyarrow is like without pandas installed.
Author: Wes McKinney <we...@twosigma.com>
Closes #487 from wesm/ARROW-668 and squashes the following commits:
554a647 [Wes McKinney] Remove cython from requirements.txt
649d28a [Wes McKinney] Box timestamp values as pandas.Timestamp if available, return timezone also if available
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e29a7d4c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e29a7d4c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e29a7d4c
Branch: refs/heads/master
Commit: e29a7d4cae943312a1f8598e71c5d46c1954b5fa
Parents: 360942e
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Apr 4 16:22:29 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Apr 4 16:22:29 2017 -0400
----------------------------------------------------------------------
python/pyarrow/array.pyx | 25 ++++------
python/pyarrow/compat.py | 17 +++++++
python/pyarrow/scalar.pyx | 47 +++++++++++++++----
python/pyarrow/tests/test_scalars.py | 76 ++++++++++++++++++++-----------
4 files changed, 112 insertions(+), 53 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 67785e3..1f59556 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -29,7 +29,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
import pyarrow.config
-from pyarrow.compat import frombytes, tobytes
+from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical
from pyarrow.error cimport check_status
from pyarrow.memory cimport MemoryPool, maybe_unbox_memory_pool
@@ -44,11 +44,6 @@ import pyarrow.schema as schema
cimport cpython
-cdef _pandas():
- import pandas as pd
- return pd
-
-
cdef maybe_coerce_datetime64(values, dtype, DataType type,
timestamps_to_ms=False):
@@ -66,7 +61,7 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
tz = dtype.tz
unit = 'ms' if coerce_ms else dtype.unit
type = schema.timestamp(unit, tz)
- else:
+ elif type is None:
# Trust the NumPy dtype
type = schema.type_from_numpy_dtype(values.dtype)
@@ -141,15 +136,13 @@ cdef class Array:
shared_ptr[CDataType] c_type
CMemoryPool* pool
- pd = _pandas()
-
if mask is not None:
mask = get_series_values(mask)
values = get_series_values(obj)
pool = maybe_unbox_memory_pool(memory_pool)
- if isinstance(values, pd.Categorical):
+ if isinstance(values, Categorical):
return DictionaryArray.from_arrays(
values.codes, values.categories.values,
mask=mask, memory_pool=memory_pool)
@@ -397,9 +390,9 @@ cdef wrap_array_output(PyObject* output):
cdef object obj = PyObject_to_object(output)
if isinstance(obj, dict):
- return _pandas().Categorical(obj['indices'],
- categories=obj['dictionary'],
- fastpath=True)
+ return Categorical(obj['indices'],
+ categories=obj['dictionary'],
+ fastpath=True)
else:
return obj
@@ -622,14 +615,12 @@ cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor):
cdef object get_series_values(object obj):
- import pandas as pd
-
- if isinstance(obj, pd.Series):
+ if isinstance(obj, PandasSeries):
result = obj.values
elif isinstance(obj, np.ndarray):
result = obj
else:
- result = pd.Series(obj).values
+ result = PandasSeries(obj).values
return result
http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index b9206aa..4dcc116 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -38,9 +38,26 @@ try:
else:
from pandas.types.dtypes import DatetimeTZDtype
pdapi = pd.api.types
+
+ PandasSeries = pd.Series
+ Categorical = pd.Categorical
HAVE_PANDAS = True
except:
HAVE_PANDAS = False
+ class DatetimeTZDtype(object):
+ pass
+
+ class ClassPlaceholder(object):
+
+ def __init__(self, *args, **kwargs):
+ raise NotImplementedError
+
+ class PandasSeries(ClassPlaceholder):
+ pass
+
+ class Categorical(ClassPlaceholder):
+ pass
+
if PY26:
import unittest2 as unittest
http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 983a9a7..1c0790a 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -26,6 +26,12 @@ cimport cpython as cp
NA = None
+
+cdef _pandas():
+ import pandas as pd
+ return pd
+
+
cdef class NAType(Scalar):
def __cinit__(self):
@@ -146,16 +152,37 @@ cdef class TimestampValue(ArrayValue):
CTimestampType* dtype = <CTimestampType*>ap.type().get()
int64_t val = ap.Value(self.index)
- if dtype.unit == TimeUnit_SECOND:
- return datetime.datetime.utcfromtimestamp(val)
- elif dtype.unit == TimeUnit_MILLI:
- return datetime.datetime.utcfromtimestamp(float(val) / 1000)
- elif dtype.unit == TimeUnit_MICRO:
- return datetime.datetime.utcfromtimestamp(float(val) / 1000000)
- else:
- # TimeUnit_NANO
- raise NotImplementedError("Cannot convert nanosecond timestamps "
- "to datetime.datetime")
+ timezone = None
+ tzinfo = None
+ if dtype.timezone.size() > 0:
+ timezone = frombytes(dtype.timezone)
+ import pytz
+ tzinfo = pytz.timezone(timezone)
+
+ try:
+ pd = _pandas()
+ if dtype.unit == TimeUnit_SECOND:
+ val = val * 1000000000
+ elif dtype.unit == TimeUnit_MILLI:
+ val = val * 1000000
+ elif dtype.unit == TimeUnit_MICRO:
+ val = val * 1000
+ return pd.Timestamp(val, tz=tzinfo)
+ except ImportError:
+ if dtype.unit == TimeUnit_SECOND:
+ result = datetime.datetime.utcfromtimestamp(val)
+ elif dtype.unit == TimeUnit_MILLI:
+ result = datetime.datetime.utcfromtimestamp(float(val) / 1000)
+ elif dtype.unit == TimeUnit_MICRO:
+ result = datetime.datetime.utcfromtimestamp(
+ float(val) / 1000000)
+ else:
+ # TimeUnit_NANO
+ raise NotImplementedError("Cannot convert nanosecond "
+ "timestamps without pandas")
+ if timezone is not None:
+ result = result.replace(tzinfo=tzinfo)
+ return result
cdef class FloatValue(ArrayValue):
http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index a5db7e0..f4f275b 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -19,69 +19,69 @@
import pandas as pd
from pyarrow.compat import unittest, u, unicode_type
-import pyarrow as A
+import pyarrow as pa
class TestScalars(unittest.TestCase):
def test_null_singleton(self):
with self.assertRaises(Exception):
- A.NAType()
+ pa.NAType()
def test_bool(self):
- arr = A.from_pylist([True, None, False, None])
+ arr = pa.from_pylist([True, None, False, None])
v = arr[0]
- assert isinstance(v, A.BooleanValue)
+ assert isinstance(v, pa.BooleanValue)
assert repr(v) == "True"
assert v.as_py() is True
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
def test_int64(self):
- arr = A.from_pylist([1, 2, None])
+ arr = pa.from_pylist([1, 2, None])
v = arr[0]
- assert isinstance(v, A.Int64Value)
+ assert isinstance(v, pa.Int64Value)
assert repr(v) == "1"
assert v.as_py() == 1
- assert arr[2] is A.NA
+ assert arr[2] is pa.NA
def test_double(self):
- arr = A.from_pylist([1.5, None, 3])
+ arr = pa.from_pylist([1.5, None, 3])
v = arr[0]
- assert isinstance(v, A.DoubleValue)
+ assert isinstance(v, pa.DoubleValue)
assert repr(v) == "1.5"
assert v.as_py() == 1.5
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
v = arr[2]
assert v.as_py() == 3.0
def test_string_unicode(self):
- arr = A.from_pylist([u'foo', None, u'mañana'])
+ arr = pa.from_pylist([u'foo', None, u'mañana'])
v = arr[0]
- assert isinstance(v, A.StringValue)
+ assert isinstance(v, pa.StringValue)
assert v.as_py() == 'foo'
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
v = arr[2].as_py()
assert v == u'mañana'
assert isinstance(v, unicode_type)
def test_bytes(self):
- arr = A.from_pylist([b'foo', None, u('bar')])
+ arr = pa.from_pylist([b'foo', None, u('bar')])
v = arr[0]
- assert isinstance(v, A.BinaryValue)
+ assert isinstance(v, pa.BinaryValue)
assert v.as_py() == b'foo'
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
v = arr[2].as_py()
assert v == b'bar'
@@ -89,41 +89,65 @@ class TestScalars(unittest.TestCase):
def test_fixed_size_bytes(self):
data = [b'foof', None, b'barb']
- arr = A.from_pylist(data, type=A.binary(4))
+ arr = pa.from_pylist(data, type=pa.binary(4))
v = arr[0]
- assert isinstance(v, A.FixedSizeBinaryValue)
+ assert isinstance(v, pa.FixedSizeBinaryValue)
assert v.as_py() == b'foof'
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
v = arr[2].as_py()
assert v == b'barb'
assert isinstance(v, bytes)
def test_list(self):
- arr = A.from_pylist([['foo', None], None, ['bar'], []])
+ arr = pa.from_pylist([['foo', None], None, ['bar'], []])
v = arr[0]
assert len(v) == 2
- assert isinstance(v, A.ListValue)
+ assert isinstance(v, pa.ListValue)
assert repr(v) == "['foo', None]"
assert v.as_py() == ['foo', None]
assert v[0].as_py() == 'foo'
- assert v[1] is A.NA
+ assert v[1] is pa.NA
- assert arr[1] is A.NA
+ assert arr[1] is pa.NA
v = arr[3]
assert len(v) == 0
+ def test_timestamp(self):
+ arr = pd.date_range('2000-01-01 12:34:56', periods=10).values
+
+ units = ['s', 'ms', 'us', 'ns']
+
+ for unit in units:
+ dtype = 'datetime64[{0}]'.format(unit)
+ arrow_arr = pa.Array.from_numpy(arr.astype(dtype))
+ expected = pd.Timestamp('2000-01-01 12:34:56')
+
+ assert arrow_arr[0].as_py() == expected
+
+ tz = 'America/New_York'
+ arrow_type = pa.timestamp(unit, tz=tz)
+
+ dtype = 'datetime64[{0}]'.format(unit)
+ arrow_arr = pa.Array.from_numpy(arr.astype(dtype),
+ type=arrow_type)
+ expected = (pd.Timestamp('2000-01-01 12:34:56')
+ .tz_localize('utc')
+ .tz_convert(tz))
+
+ assert arrow_arr[0].as_py() == expected
+
def test_dictionary(self):
colors = ['red', 'green', 'blue']
values = pd.Series(colors * 4)
categorical = pd.Categorical(values, categories=colors)
- v = A.DictionaryArray.from_arrays(categorical.codes,
- categorical.categories)
+ v = pa.DictionaryArray.from_arrays(categorical.codes,
+ categorical.categories)
for i, c in enumerate(values):
assert v[i].as_py() == c