Posted to issues@spark.apache.org by "Vladimir Smelov (JIRA)" <ji...@apache.org> on 2017/10/02 09:54:00 UTC
[jira] [Updated] (SPARK-22182) Incorrect Date and Timestamp
conversion before year 1000
[ https://issues.apache.org/jira/browse/SPARK-22182?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Vladimir Smelov updated SPARK-22182:
------------------------------------
Description:
I create a DF with None values in every odd row:
{code:python}
from pyspark.sql.functions import lit
from datetime import datetime
from pprint import pprint

df = spark.range(0, 10).withColumnRenamed('id', 'field_int')
d = datetime.now().date()
dt = datetime.now()
df = df.withColumn('field_date', lit(d))
df = df.withColumn('field_datetime', lit(dt))

def foo(part_idx, it):
    # Replace every other row with an all-None row.
    for it_idx, row in enumerate(it):
        real_idx = part_idx + it_idx
        if real_idx % 2:
            yield None, None, None
        else:
            yield row

df = df.rdd.mapPartitionsWithIndex(foo).toDF()
print('df:')
pprint(df.collect())
{code}
After that I want to fill the None values with defaults, where the default Date is 0001-01-01 and the default Timestamp is 0001-01-01 00:00:00:
{code:python}
from pyspark.sql.types import (
    StringType, BinaryType, BooleanType, DateType,
    TimestampType, DecimalType, DoubleType, FloatType, ByteType, IntegerType,
    LongType, ShortType)

SparkType2Default = {
    StringType: '',
    BinaryType: '',
    BooleanType: 0,
    DateType: '0001-01-01',
    TimestampType: '0001-01-01 00:00:00',
    DoubleType: 0.0,
    FloatType: 0.0,
    ByteType: 0,
    IntegerType: 0,
    LongType: 0,
    ShortType: 0,
}

def smart_fillna(df):
    # Build a column-name -> default-value mapping from the schema
    # and fill all None values in one pass.
    mapping = {}
    for field in df.schema.fields:
        name = field.name
        spark_type = type(field.dataType)
        default_value = SparkType2Default[spark_type]
        mapping[name] = default_value
    df = df.fillna(mapping)
    return df

df = smart_fillna(df)
print('df:')
pprint(df.collect())
{code}
Then I got an error:
{code:python}
Traceback (most recent call last):
  File "/home/vsmelov/PycharmProjects/etl/spark_test/import.py", line 220, in <module>
    write_test()
  File "/home/vsmelov/PycharmProjects/etl/spark_test/import.py", line 203, in write_test
    pprint(df.collect())
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line 439, in collect
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 144, in load_stream
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 451, in loads
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/types.py", line 1371, in <lambda>
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/types.py", line 602, in fromInternal
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/types.py", line 602, in <listcomp>
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/types.py", line 439, in fromInternal
  File "/var/bigdata/spark-2.2.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/sql/types.py", line 176, in fromInternal
    def send(self, val):
ValueError: ordinal must be >= 1
{code}
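The ValueError itself is raised on the Python side during deserialization. My reading of the traceback (an assumption, based on the frame at pyspark/sql/types.py pointing into DateType.fromInternal) is that the JVM hands Python a days-since-epoch integer that is a few days too small, so the reconstructed ordinal drops below 1. A minimal sketch of that failure mode:
{code:python}
from datetime import date

# EPOCH_ORDINAL mirrors what DateType.fromInternal uses:
# the proleptic-Gregorian ordinal of 1970-01-01 (719163).
EPOCH_ORDINAL = date(1970, 1, 1).toordinal()

# 0001-01-01 has ordinal 1, so the JVM should send days = -719162 ...
ok_days = date(1, 1, 1).toordinal() - EPOCH_ORDINAL
print(date.fromordinal(ok_days + EPOCH_ORDINAL))  # 0001-01-01

# ... but if the JVM's string-to-days conversion is off by a couple of
# days (e.g. a Julian vs proleptic-Gregorian calendar mismatch), the
# ordinal drops below 1 and fromordinal raises exactly this error:
bad_days = ok_days - 2  # hypothetical off-by-two from the JVM side
date.fromordinal(bad_days + EPOCH_ORDINAL)  # ValueError: ordinal must be >= 1
{code}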
So after some experiments, I realized that there is an error in the conversion of string values to Date and Timestamp, and I had to replace the default values with:
{code:python}
SparkType2Default = {
    StringType: '',
    BinaryType: '',
    BooleanType: 0,
    DateType: '0001-01-03',                # was '0001-01-01'
    TimestampType: '0001-01-03 00:29:43',  # was '0001-01-01 00:00:00'
    DoubleType: 0.0,
    FloatType: 0.0,
    ByteType: 0,
    IntegerType: 0,
    LongType: 0,
    ShortType: 0,
}
{code}
Note the changed entries: DateType: *'0001-01-03'* and TimestampType: *'0001-01-03 00:29:43'*. With these values it works correctly.
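To pin down the earliest usable value, a small probe can walk forward from 0001-01-01 until collect() stops failing. This is a hypothetical helper, not part of my original experiments; it assumes a live `spark` session and that the string-to-date cast takes the same JVM conversion path:
{code:python}
from datetime import date, timedelta
from pyspark.sql.functions import lit

def first_collectable_date(spark, start=date(1, 1, 1), max_tries=10):
    """Walk forward day by day until a date survives a full round trip."""
    d = start
    for _ in range(max_tries):
        try:
            spark.range(1).withColumn('d', lit(d.isoformat()).cast('date')).collect()
            return d
        except Exception:
            d += timedelta(days=1)
    return None

# Given the observations above, this should print 0001-01-03.
print(first_collectable_date(spark))
{code}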
> Incorrect Date and Timestamp conversion before year 1000
> --------------------------------------------------------
>
> Key: SPARK-22182
> URL: https://issues.apache.org/jira/browse/SPARK-22182
> Project: Spark
> Issue Type: Bug
> Components: PySpark
> Affects Versions: 2.2.0
> Reporter: Vladimir Smelov