You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/04/04 20:22:34 UTC

arrow git commit: ARROW-668: [Python] Box timestamp values as pandas.Timestamp if available, attach tzinfo

Repository: arrow
Updated Branches:
  refs/heads/master 360942e61 -> e29a7d4ca


ARROW-668: [Python] Box timestamp values as pandas.Timestamp if available, attach tzinfo

I'm not sure how to easily test the behavior if pandas is not present. I created an environment without pandas and added some fixes so that I could verify the behavior, but at some point we should create a "no pandas" test suite to see what using pyarrow is like without pandas installed.

Author: Wes McKinney <we...@twosigma.com>

Closes #487 from wesm/ARROW-668 and squashes the following commits:

554a647 [Wes McKinney] Remove cython from requirements.txt
649d28a [Wes McKinney] Box timestamp values as pandas.Timestamp if available, return timezone also if available


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e29a7d4c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e29a7d4c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e29a7d4c

Branch: refs/heads/master
Commit: e29a7d4cae943312a1f8598e71c5d46c1954b5fa
Parents: 360942e
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Apr 4 16:22:29 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Apr 4 16:22:29 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/array.pyx             | 25 ++++------
 python/pyarrow/compat.py             | 17 +++++++
 python/pyarrow/scalar.pyx            | 47 +++++++++++++++----
 python/pyarrow/tests/test_scalars.py | 76 ++++++++++++++++++++-----------
 4 files changed, 112 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/array.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index 67785e3..1f59556 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -29,7 +29,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
 
 import pyarrow.config
 
-from pyarrow.compat import frombytes, tobytes
+from pyarrow.compat import frombytes, tobytes, PandasSeries, Categorical
 from pyarrow.error cimport check_status
 from pyarrow.memory cimport MemoryPool, maybe_unbox_memory_pool
 
@@ -44,11 +44,6 @@ import pyarrow.schema as schema
 cimport cpython
 
 
-cdef _pandas():
-    import pandas as pd
-    return pd
-
-
 cdef maybe_coerce_datetime64(values, dtype, DataType type,
                              timestamps_to_ms=False):
 
@@ -66,7 +61,7 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type,
         tz = dtype.tz
         unit = 'ms' if coerce_ms else dtype.unit
         type = schema.timestamp(unit, tz)
-    else:
+    elif type is None:
         # Trust the NumPy dtype
         type = schema.type_from_numpy_dtype(values.dtype)
 
@@ -141,15 +136,13 @@ cdef class Array:
             shared_ptr[CDataType] c_type
             CMemoryPool* pool
 
-        pd = _pandas()
-
         if mask is not None:
             mask = get_series_values(mask)
 
         values = get_series_values(obj)
         pool = maybe_unbox_memory_pool(memory_pool)
 
-        if isinstance(values, pd.Categorical):
+        if isinstance(values, Categorical):
             return DictionaryArray.from_arrays(
                 values.codes, values.categories.values,
                 mask=mask, memory_pool=memory_pool)
@@ -397,9 +390,9 @@ cdef wrap_array_output(PyObject* output):
     cdef object obj = PyObject_to_object(output)
 
     if isinstance(obj, dict):
-        return _pandas().Categorical(obj['indices'],
-                                     categories=obj['dictionary'],
-                                     fastpath=True)
+        return Categorical(obj['indices'],
+                           categories=obj['dictionary'],
+                           fastpath=True)
     else:
         return obj
 
@@ -622,14 +615,12 @@ cdef object box_tensor(const shared_ptr[CTensor]& sp_tensor):
 
 
 cdef object get_series_values(object obj):
-    import pandas as pd
-
-    if isinstance(obj, pd.Series):
+    if isinstance(obj, PandasSeries):
         result = obj.values
     elif isinstance(obj, np.ndarray):
         result = obj
     else:
-        result = pd.Series(obj).values
+        result = PandasSeries(obj).values
 
     return result
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/compat.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index b9206aa..4dcc116 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -38,9 +38,26 @@ try:
     else:
         from pandas.types.dtypes import DatetimeTZDtype
         pdapi = pd.api.types
+
+    PandasSeries = pd.Series
+    Categorical = pd.Categorical
     HAVE_PANDAS = True
 except:
     HAVE_PANDAS = False
+    class DatetimeTZDtype(object):
+        pass
+
+    class ClassPlaceholder(object):
+
+        def __init__(self, *args, **kwargs):
+            raise NotImplementedError
+
+    class PandasSeries(ClassPlaceholder):
+        pass
+
+    class Categorical(ClassPlaceholder):
+        pass
+
 
 if PY26:
     import unittest2 as unittest

http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/scalar.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx
index 983a9a7..1c0790a 100644
--- a/python/pyarrow/scalar.pyx
+++ b/python/pyarrow/scalar.pyx
@@ -26,6 +26,12 @@ cimport cpython as cp
 
 NA = None
 
+
+cdef _pandas():
+    import pandas as pd
+    return pd
+
+
 cdef class NAType(Scalar):
 
     def __cinit__(self):
@@ -146,16 +152,37 @@ cdef class TimestampValue(ArrayValue):
             CTimestampType* dtype = <CTimestampType*>ap.type().get()
             int64_t val = ap.Value(self.index)
 
-        if dtype.unit == TimeUnit_SECOND:
-            return datetime.datetime.utcfromtimestamp(val)
-        elif dtype.unit == TimeUnit_MILLI:
-            return datetime.datetime.utcfromtimestamp(float(val) / 1000)
-        elif dtype.unit == TimeUnit_MICRO:
-            return datetime.datetime.utcfromtimestamp(float(val) / 1000000)
-        else:
-            # TimeUnit_NANO
-            raise NotImplementedError("Cannot convert nanosecond timestamps "
-                                      "to datetime.datetime")
+        timezone = None
+        tzinfo = None
+        if dtype.timezone.size() > 0:
+            timezone = frombytes(dtype.timezone)
+            import pytz
+            tzinfo = pytz.timezone(timezone)
+
+        try:
+            pd = _pandas()
+            if dtype.unit == TimeUnit_SECOND:
+                val = val * 1000000000
+            elif dtype.unit == TimeUnit_MILLI:
+                val = val * 1000000
+            elif dtype.unit == TimeUnit_MICRO:
+                val = val * 1000
+            return pd.Timestamp(val, tz=tzinfo)
+        except ImportError:
+            if dtype.unit == TimeUnit_SECOND:
+                result = datetime.datetime.utcfromtimestamp(val)
+            elif dtype.unit == TimeUnit_MILLI:
+                result = datetime.datetime.utcfromtimestamp(float(val) / 1000)
+            elif dtype.unit == TimeUnit_MICRO:
+                result = datetime.datetime.utcfromtimestamp(
+                    float(val) / 1000000)
+            else:
+                # TimeUnit_NANO
+                raise NotImplementedError("Cannot convert nanosecond "
+                                          "timestamps without pandas")
+            if timezone is not None:
+                result = result.replace(tzinfo=tzinfo)
+            return result
 
 
 cdef class FloatValue(ArrayValue):

http://git-wip-us.apache.org/repos/asf/arrow/blob/e29a7d4c/python/pyarrow/tests/test_scalars.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index a5db7e0..f4f275b 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -19,69 +19,69 @@
 import pandas as pd
 
 from pyarrow.compat import unittest, u, unicode_type
-import pyarrow as A
+import pyarrow as pa
 
 
 class TestScalars(unittest.TestCase):
 
     def test_null_singleton(self):
         with self.assertRaises(Exception):
-            A.NAType()
+            pa.NAType()
 
     def test_bool(self):
-        arr = A.from_pylist([True, None, False, None])
+        arr = pa.from_pylist([True, None, False, None])
 
         v = arr[0]
-        assert isinstance(v, A.BooleanValue)
+        assert isinstance(v, pa.BooleanValue)
         assert repr(v) == "True"
         assert v.as_py() is True
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
     def test_int64(self):
-        arr = A.from_pylist([1, 2, None])
+        arr = pa.from_pylist([1, 2, None])
 
         v = arr[0]
-        assert isinstance(v, A.Int64Value)
+        assert isinstance(v, pa.Int64Value)
         assert repr(v) == "1"
         assert v.as_py() == 1
 
-        assert arr[2] is A.NA
+        assert arr[2] is pa.NA
 
     def test_double(self):
-        arr = A.from_pylist([1.5, None, 3])
+        arr = pa.from_pylist([1.5, None, 3])
 
         v = arr[0]
-        assert isinstance(v, A.DoubleValue)
+        assert isinstance(v, pa.DoubleValue)
         assert repr(v) == "1.5"
         assert v.as_py() == 1.5
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
         v = arr[2]
         assert v.as_py() == 3.0
 
     def test_string_unicode(self):
-        arr = A.from_pylist([u'foo', None, u'mañana'])
+        arr = pa.from_pylist([u'foo', None, u'mañana'])
 
         v = arr[0]
-        assert isinstance(v, A.StringValue)
+        assert isinstance(v, pa.StringValue)
         assert v.as_py() == 'foo'
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
         v = arr[2].as_py()
         assert v == u'mañana'
         assert isinstance(v, unicode_type)
 
     def test_bytes(self):
-        arr = A.from_pylist([b'foo', None, u('bar')])
+        arr = pa.from_pylist([b'foo', None, u('bar')])
 
         v = arr[0]
-        assert isinstance(v, A.BinaryValue)
+        assert isinstance(v, pa.BinaryValue)
         assert v.as_py() == b'foo'
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
         v = arr[2].as_py()
         assert v == b'bar'
@@ -89,41 +89,65 @@ class TestScalars(unittest.TestCase):
 
     def test_fixed_size_bytes(self):
         data = [b'foof', None, b'barb']
-        arr = A.from_pylist(data, type=A.binary(4))
+        arr = pa.from_pylist(data, type=pa.binary(4))
 
         v = arr[0]
-        assert isinstance(v, A.FixedSizeBinaryValue)
+        assert isinstance(v, pa.FixedSizeBinaryValue)
         assert v.as_py() == b'foof'
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
         v = arr[2].as_py()
         assert v == b'barb'
         assert isinstance(v, bytes)
 
     def test_list(self):
-        arr = A.from_pylist([['foo', None], None, ['bar'], []])
+        arr = pa.from_pylist([['foo', None], None, ['bar'], []])
 
         v = arr[0]
         assert len(v) == 2
-        assert isinstance(v, A.ListValue)
+        assert isinstance(v, pa.ListValue)
         assert repr(v) == "['foo', None]"
         assert v.as_py() == ['foo', None]
         assert v[0].as_py() == 'foo'
-        assert v[1] is A.NA
+        assert v[1] is pa.NA
 
-        assert arr[1] is A.NA
+        assert arr[1] is pa.NA
 
         v = arr[3]
         assert len(v) == 0
 
+    def test_timestamp(self):
+        arr = pd.date_range('2000-01-01 12:34:56', periods=10).values
+
+        units = ['s', 'ms', 'us', 'ns']
+
+        for unit in units:
+            dtype = 'datetime64[{0}]'.format(unit)
+            arrow_arr = pa.Array.from_numpy(arr.astype(dtype))
+            expected = pd.Timestamp('2000-01-01 12:34:56')
+
+            assert arrow_arr[0].as_py() == expected
+
+            tz = 'America/New_York'
+            arrow_type = pa.timestamp(unit, tz=tz)
+
+            dtype = 'datetime64[{0}]'.format(unit)
+            arrow_arr = pa.Array.from_numpy(arr.astype(dtype),
+                                            type=arrow_type)
+            expected = (pd.Timestamp('2000-01-01 12:34:56')
+                        .tz_localize('utc')
+                        .tz_convert(tz))
+
+            assert arrow_arr[0].as_py() == expected
+
     def test_dictionary(self):
         colors = ['red', 'green', 'blue']
         values = pd.Series(colors * 4)
 
         categorical = pd.Categorical(values, categories=colors)
 
-        v = A.DictionaryArray.from_arrays(categorical.codes,
-                                          categorical.categories)
+        v = pa.DictionaryArray.from_arrays(categorical.codes,
+                                           categorical.categories)
         for i, c in enumerate(values):
             assert v[i].as_py() == c