You are viewing a plain text version of this content. The canonical link for it is here.

Posted to user@spark.apache.org by ericjhilton <er...@gmail.com> on 2016/06/16 16:16:29 UTC

converting timestamp from UTC to many time zones

This is using python with Spark 1.6.1 and dataframes.

I have timestamps in UTC that I want to convert to local time, but a given
row could be in any of several timezones. I have an 'offset' value (or
alternately, the local timezone abbreviation. I can adjust all the
timestamps to a single zone or with a single offset easily enough, but I
can't figure out how to make the adjustment dependent on the 'offset' or
'tz' column.

There appear to be 2 main ways of adjusting a timestamp: using the
'INTERVAL' method, or using pyspark.sql.from_utc_timestamp.

Here's an example:
---

data = [ ("2015-01-01 23:59:59", "2015-01-02 00:01:02", 1, 300,"MST"),
("2015-01-02 23:00:00", "2015-01-02 23:59:59", 2, 60,"EST"),
("2015-01-02 22:59:58", "2015-01-02 23:59:59", 3, 120,"EST"),
("2015-03-02 15:59:58", "2015-01-02 23:59:59", 4, 120,"PST"),
("2015-03-16 15:15:58", "2015-01-02 23:59:59", 5, 120,"PST"),
("2015-10-02 18:59:58", "2015-01-02 23:59:59", 4, 120,"PST"),
("2015-11-16 18:58:58", "2015-01-02 23:59:59", 5, 120,"PST"),
("2015-03-02 15:59:58", "2015-01-02 23:59:59", 4, 120,"MST"),
("2015-03-16 15:15:58", "2015-01-02 23:59:59", 5, 120,"MST"),
("2015-10-02 18:59:58", "2015-01-02 23:59:59", 4, 120,"MST"),
("2015-11-16 18:58:58", "2015-01-02 23:59:59", 5, 120,"MST"),]

df = sqlCtx.createDataFrame(data, ["start_time", "end_time",
"id","offset","tz"])
from pyspark.sql import functions as F

df.withColumn('testthis', F.from_utc_timestamp(df.start_time, "PST")).show()
df.withColumn('testThat', df.start_time.cast("timestamp") - F.expr("INTERVAL
50 MINUTES")).show()

----
those last 2 lines work as expected, but I want to replace "PST" with the
df.tz column or use the df.offset column with INTERVAL 


Here's the error I get. Is there a workaround to this?

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-14-fe409c16a012> in <module>()
----> 1 df.withColumn('testthis', F.from_utc_timestamp(df.start_time,
df.tz)).show()

/opt/spark-1.6.1/python/pyspark/sql/functions.py in
from_utc_timestamp(timestamp, tz)
    967     """
    968     sc = SparkContext._active_spark_context
--> 969     return
Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz))
    970 
    971 

/opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in
__call__(self, *args)
    796     def __call__(self, *args):
    797         if self.converters is not None and len(self.converters) > 0:
--> 798             (new_args, temp_args) = self._get_args(args)
    799         else:
    800             new_args = args

/opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in
_get_args(self, args)
    783                 for converter in self.gateway_client.converters:
    784                     if converter.can_convert(arg):
--> 785                         temp_arg = converter.convert(arg,
self.gateway_client)
    786                         temp_args.append(temp_arg)
    787                         new_args.append(temp_arg)

/opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_collections.py in
convert(self, object, gateway_client)
    510         HashMap = JavaClass("java.util.HashMap", gateway_client)
    511         java_map = HashMap()
--> 512         for key in object.keys():
    513             java_map[key] = object[key]
    514         return java_map

TypeError: 'Column' object is not callable



--
View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/converting-timestamp-from-UTC-to-many-time-zones-tp27182.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@spark.apache.org
For additional commands, e-mail: user-help@spark.apache.org

Re: converting timestamp from UTC to many time zones

Posted by Davies Liu <da...@databricks.com>.

The DataFrame API does not support this use case, you can use still
use SQL do that,

df.selectExpr("from_utc_timestamp(start, tz) as testthis")

On Thu, Jun 16, 2016 at 9:16 AM, ericjhilton <er...@gmail.com> wrote:
> This is using python with Spark 1.6.1 and dataframes.
>
> I have timestamps in UTC that I want to convert to local time, but a given
> row could be in any of several timezones. I have an 'offset' value (or
> alternately, the local timezone abbreviation. I can adjust all the
> timestamps to a single zone or with a single offset easily enough, but I
> can't figure out how to make the adjustment dependent on the 'offset' or
> 'tz' column.
>
> There appear to be 2 main ways of adjusting a timestamp: using the
> 'INTERVAL' method, or using pyspark.sql.from_utc_timestamp.
>
> Here's an example:
> ---
>
> data = [ ("2015-01-01 23:59:59", "2015-01-02 00:01:02", 1, 300,"MST"),
> ("2015-01-02 23:00:00", "2015-01-02 23:59:59", 2, 60,"EST"),
> ("2015-01-02 22:59:58", "2015-01-02 23:59:59", 3, 120,"EST"),
> ("2015-03-02 15:59:58", "2015-01-02 23:59:59", 4, 120,"PST"),
> ("2015-03-16 15:15:58", "2015-01-02 23:59:59", 5, 120,"PST"),
> ("2015-10-02 18:59:58", "2015-01-02 23:59:59", 4, 120,"PST"),
> ("2015-11-16 18:58:58", "2015-01-02 23:59:59", 5, 120,"PST"),
> ("2015-03-02 15:59:58", "2015-01-02 23:59:59", 4, 120,"MST"),
> ("2015-03-16 15:15:58", "2015-01-02 23:59:59", 5, 120,"MST"),
> ("2015-10-02 18:59:58", "2015-01-02 23:59:59", 4, 120,"MST"),
> ("2015-11-16 18:58:58", "2015-01-02 23:59:59", 5, 120,"MST"),]
>
> df = sqlCtx.createDataFrame(data, ["start_time", "end_time",
> "id","offset","tz"])
> from pyspark.sql import functions as F
>
> df.withColumn('testthis', F.from_utc_timestamp(df.start_time, "PST")).show()
> df.withColumn('testThat', df.start_time.cast("timestamp") - F.expr("INTERVAL
> 50 MINUTES")).show()
>
> ----
> those last 2 lines work as expected, but I want to replace "PST" with the
> df.tz column or use the df.offset column with INTERVAL
>
>
> Here's the error I get. Is there a workaround to this?
>
> ---------------------------------------------------------------------------
> TypeError                                 Traceback (most recent call last)
> <ipython-input-14-fe409c16a012> in <module>()
> ----> 1 df.withColumn('testthis', F.from_utc_timestamp(df.start_time,
> df.tz)).show()
>
> /opt/spark-1.6.1/python/pyspark/sql/functions.py in
> from_utc_timestamp(timestamp, tz)
>     967     """
>     968     sc = SparkContext._active_spark_context
> --> 969     return
> Column(sc._jvm.functions.from_utc_timestamp(_to_java_column(timestamp), tz))
>     970
>     971
>
> /opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in
> __call__(self, *args)
>     796     def __call__(self, *args):
>     797         if self.converters is not None and len(self.converters) > 0:
> --> 798             (new_args, temp_args) = self._get_args(args)
>     799         else:
>     800             new_args = args
>
> /opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in
> _get_args(self, args)
>     783                 for converter in self.gateway_client.converters:
>     784                     if converter.can_convert(arg):
> --> 785                         temp_arg = converter.convert(arg,
> self.gateway_client)
>     786                         temp_args.append(temp_arg)
>     787                         new_args.append(temp_arg)
>
> /opt/spark-1.6.1/python/lib/py4j-0.9-src.zip/py4j/java_collections.py in
> convert(self, object, gateway_client)
>     510         HashMap = JavaClass("java.util.HashMap", gateway_client)
>     511         java_map = HashMap()
> --> 512         for key in object.keys():
>     513             java_map[key] = object[key]
>     514         return java_map
>
> TypeError: 'Column' object is not callable
>
>
>
> --
> View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/converting-timestamp-from-UTC-to-many-time-zones-tp27182.html
> Sent from the Apache Spark User List mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: user-unsubscribe@spark.apache.org
> For additional commands, e-mail: user-help@spark.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@spark.apache.org
For additional commands, e-mail: user-help@spark.apache.org