You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/04/03 04:39:28 UTC
[arrow] branch master updated: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fff992a ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
fff992a is described below
commit fff992a048c4e4feb241772047f89f48ebe6201c
Author: Albert Shieh <as...@ansatzcapital.com>
AuthorDate: Tue Apr 3 06:39:21 2018 +0200
ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
Fixes [ARROW-2122](https://issues.apache.org/jira/browse/ARROW-2122).
Author: Albert Shieh <as...@ansatzcapital.com>
Closes #1707 from adshieh/ARROW-2122 and squashes the following commits:
2fae121 <Albert Shieh> No trailing characters
1c5e093 <Albert Shieh> Change fixed offset timezone format and add tests
71c5c47 <Albert Shieh> Fixed offset timezone handling
---
python/pyarrow/pandas_compat.py | 5 ++-
python/pyarrow/scalar.pxi | 3 +-
python/pyarrow/table.pxi | 6 ++-
python/pyarrow/tests/test_convert_builtin.py | 12 ++++++
python/pyarrow/tests/test_convert_pandas.py | 10 +++++
python/pyarrow/types.pxi | 61 +++++++++++++++++++++++++++-
6 files changed, 90 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 97ea51d..24da744 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -129,7 +129,7 @@ def get_extension_dtype_info(column):
}
physical_dtype = str(cats.codes.dtype)
elif hasattr(dtype, 'tz'):
- metadata = {'timezone': str(dtype.tz)}
+ metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)}
physical_dtype = 'datetime64[ns]'
else:
metadata = None
@@ -419,7 +419,7 @@ def dataframe_to_serialized_dict(frame):
block_data = {}
if isinstance(block, _int.DatetimeTZBlock):
- block_data['timezone'] = values.tz.zone
+ block_data['timezone'] = pa.lib.tzinfo_to_string(values.tz)
values = values.values
elif isinstance(block, _int.CategoricalBlock):
block_data.update(dictionary=values.categories,
@@ -483,6 +483,7 @@ def _reconstruct_block(item):
def _make_datetimetz(tz):
from pyarrow.compat import DatetimeTZDtype
+ tz = pa.lib.string_to_tzinfo(tz)
return DatetimeTZDtype('ns', tz=tz)
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 6784e56..e152771 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -237,8 +237,7 @@ cdef class TimestampValue(ArrayValue):
value = self.value
if not dtype.timezone().empty():
- import pytz
- tzinfo = pytz.timezone(frombytes(dtype.timezone()))
+ tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
else:
tzinfo = None
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 672b9fb..cbf2a69 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -344,9 +344,11 @@ cdef class Column:
result = pd.Series(values, name=self.name)
if isinstance(self.type, TimestampType):
- if self.type.tz is not None:
+ tz = self.type.tz
+ if tz is not None:
+ tz = string_to_tzinfo(tz)
result = (result.dt.tz_localize('utc')
- .dt.tz_convert(self.type.tz))
+ .dt.tz_convert(tz))
return result
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 19b59a4..988d512 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -26,6 +26,7 @@ import decimal
import itertools
import numpy as np
import six
+import pytz
int_type_pairs = [
@@ -649,3 +650,14 @@ def test_decimal_array_with_none_and_nan():
array = pa.array(values, type=pa.decimal128(10, 4))
assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
+
+
+@pytest.mark.parametrize('tz,name', [
+ (pytz.FixedOffset(90), '+01:30'),
+ (pytz.FixedOffset(-90), '-01:30'),
+ (pytz.utc, 'UTC'),
+ (pytz.timezone('America/New_York'), 'America/New_York')
+])
+def test_timezone_string(tz, name):
+ assert pa.lib.tzinfo_to_string(tz) == name
+ assert pa.lib.string_to_tzinfo(name) == tz
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 45ec66d..d4d3294 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1039,6 +1039,16 @@ class TestConvertDateTimeLikeTypes(object):
expected = pd.Series([None, date(1991, 1, 1), None])
assert pa.Array.from_pandas(expected).equals(result)
+ def test_fixed_offset_timezone(self):
+ df = pd.DataFrame({
+ 'a': [
+ pd.Timestamp('2012-11-11 00:00:00+01:00'),
+ pd.NaT
+ ]
+ })
+ _check_pandas_roundtrip(df)
+ _check_serialize_components_roundtrip(df)
+
class TestConvertStringLikeTypes(object):
"""
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b0557eb..2abdb30 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+import re
+
# These are imprecise because the type (in pandas 0.x) depends on the presence
# of nulls
cdef dict _pandas_type_map = {
@@ -840,6 +842,63 @@ cdef timeunit_to_string(TimeUnit unit):
return 'ns'
+_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$')
+
+
+def tzinfo_to_string(tz):
+ """
+ Converts a time zone object into a string indicating the name of a time
+ zone, one of:
+ * As used in the Olson time zone database (the "tz database" or
+ "tzdata"), such as "America/New_York"
+ * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+ Parameters
+ ----------
+ tz : datetime.tzinfo
+ Time zone object
+
+ Returns
+ -------
+ name : string
+ Time zone name
+ """
+ if tz.zone is None:
+ sign = '+' if tz._minutes >= 0 else '-'
+ hours, minutes = divmod(abs(tz._minutes), 60)
+ return '{}{:02d}:{:02d}'.format(sign, hours, minutes)
+ else:
+ return tz.zone
+
+
+def string_to_tzinfo(name):
+ """
+ Converts a string indicating the name of a time zone into a time zone
+ object, one of:
+ * As used in the Olson time zone database (the "tz database" or
+ "tzdata"), such as "America/New_York"
+ * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+ Parameters
+ ----------
+ name: string
+ Time zone name
+
+ Returns
+ -------
+ tz : datetime.tzinfo
+ Time zone object
+ """
+ import pytz
+ m = _FIXED_OFFSET_RE.match(name)
+ if m:
+ sign = 1 if m.group(1) == '+' else -1
+ hours, minutes = map(int, m.group(2, 3))
+ return pytz.FixedOffset(sign * (hours * 60 + minutes))
+ else:
+ return pytz.timezone(name)
+
+
def timestamp(unit, tz=None):
"""
Create instance of timestamp type with resolution and optional time zone
@@ -887,7 +946,7 @@ def timestamp(unit, tz=None):
_timestamp_type_cache[unit_code] = out
else:
if not isinstance(tz, six.string_types):
- tz = tz.zone
+ tz = tzinfo_to_string(tz)
c_timezone = tobytes(tz)
out.init(ctimestamp(unit_code, c_timezone))
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.