Posted to issues@arrow.apache.org by "Evangelos Pertsinis (Jira)" <ji...@apache.org> on 2020/02/04 02:29:00 UTC

[jira] [Updated] (ARROW-7758) Specific dates such as 0000-01-01 raise ValueError and parquet file is not loaded into pandas dataframe

     [ https://issues.apache.org/jira/browse/ARROW-7758?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Evangelos Pertsinis updated ARROW-7758:
---------------------------------------
    Description: 
Using pandas.read_parquet() with pyarrow as the engine raises a ValueError when the Parquet file contains a date column with the value 0000-01-01.

PySpark can read the same Parquet file with no issues, and PyArrow up to version 0.11.1 could read it as well.

 
{code:python}

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-06e3cce13e18> in <module>
----> 1 df_init_df = read_parquet_files('{}/DebtFacility'.format(ext_path))

<ipython-input-4-f12125c1c8fe> in read_parquet_files(folder_path)
      2     files = [f for f in os.listdir(folder_path) if f.endswith('parquet')]
      3 
----> 4     df_list = [pd.read_parquet(os.path.join(folder_path, f)) for f in files]
      5 
      6     print(files)

<ipython-input-4-f12125c1c8fe> in <listcomp>(.0)
      2     files = [f for f in os.listdir(folder_path) if f.endswith('parquet')]
      3 
----> 4     df_list = [pd.read_parquet(os.path.join(folder_path, f)) for f in files]
      5 
      6     print(files)

/opt/conda/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
    294 
    295     impl = get_engine(engine)
--> 296     return impl.read(path, columns=columns, **kwargs)

/opt/conda/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs)
    123         kwargs["use_pandas_metadata"] = True
    124         result = self.api.parquet.read_table(
--> 125             path, columns=columns, **kwargs
    126         ).to_pandas()
    127         if should_close:

/opt/conda/lib/python3.6/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()

/opt/conda/lib/python3.6/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()

/opt/conda/lib/python3.6/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata)
    702 
    703     _check_data_column_metadata_consistency(all_columns)
--> 704     blocks = _table_to_blocks(options, table, categories)
    705     columns = _deserialize_column_index(table, all_columns, column_indexes)
    706 

/opt/conda/lib/python3.6/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories)
    974 
    975     # Convert an arrow table to Block from the internal pandas API
--> 976     result = pa.lib.table_to_blocks(options, block_table, categories)
    977 
    978     # Defined above

/opt/conda/lib/python3.6/site-packages/pyarrow/table.pxi in pyarrow.lib.table_to_blocks()

ValueError: year -1 is out of range

{code}
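
A minimal reproduction sketch (not from the original report): it assumes the failure comes from a date32 value earlier than year 1, which Python's datetime.date cannot represent (it only supports years 1-9999), so the exact message may differ from the "year -1" shown above. The column name and output path are illustrative only.

{code:python}
# Hypothetical reproduction sketch, assuming the problem is a date before year 1
# stored in a date32 column. Not taken from the original DebtFacility files.
import datetime

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# date32 stores days since 1970-01-01. 0001-01-01 is the earliest date that
# datetime.date can hold, so one day earlier already falls into year 0.
days_to_year_one = (datetime.date(1, 1, 1) - datetime.date(1970, 1, 1)).days
table = pa.table({"some_date": pa.array([days_to_year_one - 1], type=pa.date32())})
pq.write_table(table, "year_zero.parquet")

# Expected to raise a ValueError ("year 0 is out of range" or similar) while
# pyarrow converts the date32 column to Python date objects for pandas.
df = pd.read_parquet("year_zero.parquet", engine="pyarrow")
{code}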

  was:
Using pandas.read_parquet() with pyarrow as the engine raises a ValueError when the Parquet file contains a date column with the value 0000-01-01.

PySpark can read the same Parquet file with no issues.

 
{code:python}

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-06e3cce13e18> in <module>
----> 1 df_init_df = read_parquet_files('{}/DebtFacility'.format(ext_path))

<ipython-input-4-f12125c1c8fe> in read_parquet_files(folder_path)
      2     files = [f for f in os.listdir(folder_path) if f.endswith('parquet')]
      3 
----> 4     df_list = [pd.read_parquet(os.path.join(folder_path, f)) for f in files]
      5 
      6     print(files)

<ipython-input-4-f12125c1c8fe> in <listcomp>(.0)
      2     files = [f for f in os.listdir(folder_path) if f.endswith('parquet')]
      3 
----> 4     df_list = [pd.read_parquet(os.path.join(folder_path, f)) for f in files]
      5 
      6     print(files)

/opt/conda/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
    294 
    295     impl = get_engine(engine)
--> 296     return impl.read(path, columns=columns, **kwargs)

/opt/conda/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs)
    123         kwargs["use_pandas_metadata"] = True
    124         result = self.api.parquet.read_table(
--> 125             path, columns=columns, **kwargs
    126         ).to_pandas()
    127         if should_close:

/opt/conda/lib/python3.6/site-packages/pyarrow/array.pxi in pyarrow.lib._PandasConvertible.to_pandas()

/opt/conda/lib/python3.6/site-packages/pyarrow/table.pxi in pyarrow.lib.Table._to_pandas()

/opt/conda/lib/python3.6/site-packages/pyarrow/pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata)
    702 
    703     _check_data_column_metadata_consistency(all_columns)
--> 704     blocks = _table_to_blocks(options, table, categories)
    705     columns = _deserialize_column_index(table, all_columns, column_indexes)
    706 

/opt/conda/lib/python3.6/site-packages/pyarrow/pandas_compat.py in _table_to_blocks(options, block_table, categories)
    974 
    975     # Convert an arrow table to Block from the internal pandas API
--> 976     result = pa.lib.table_to_blocks(options, block_table, categories)
    977 
    978     # Defined above

/opt/conda/lib/python3.6/site-packages/pyarrow/table.pxi in pyarrow.lib.table_to_blocks()

ValueError: year -1 is out of range

{code}


> Specific dates such as 0000-01-01 raise ValueError and parquet file is not loaded into pandas dataframe
> -------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-7758
>                 URL: https://issues.apache.org/jira/browse/ARROW-7758
>             Project: Apache Arrow
>          Issue Type: Bug
>            Reporter: Evangelos Pertsinis
>            Priority: Major
>



--
This message was sent by Atlassian Jira
(v8.3.4#803005)