You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Gert Hulselmans (Jira)" <ji...@apache.org> on 2020/09/22 07:16:00 UTC
[jira] [Comment Edited] (ARROW-10056) [Python] PyArrow writes
invalid Feather v2 file: OSError: Verification of flatbuffer-encoded Footer
failed.
[ https://issues.apache.org/jira/browse/ARROW-10056?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17199885#comment-17199885 ]
Gert Hulselmans edited comment on ARROW-10056 at 9/22/20, 7:15 AM:
-------------------------------------------------------------------
It seems like the length of the column names does not matter:
{code:python}
def feather_bisect_long_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['motif' * 100 + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['region' * 100 + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
def feather_bisect_short_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['m' + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['r' + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
def feather_bisect_no_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions))
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
{code}
All work with 499998 but not with 499999 columns.
I also managed to get another error message when I constructed very big column names. If the same happens above (wrong calculation of buffer size, but not a negative number), it might explain the corruption of the written Feather v2 file.
{code:python}
def feather_bisect_very_long_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['motif' * 10000 + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['region' * 10000 + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
In [174]: feather_bisect_very_long_names(1, 500000-2)
shape: (1, 499998)
write
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
<ipython-input-174-5a1ef546402a> in <module>
----> 1 feather_bisect(1, 500000-2)
<ipython-input-173-dced00174027> in feather_bisect_very_long_names(nbr_motifs, nbr_regions)
8
9 print('write')
---> 10 pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
11
12 print('read')
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in write_feather(df, dest, compression, compression_level, chunksize, version)
180
181 try:
--> 182 ext.write_feather(table, dest, compression=compression,
183 compression_level=compression_level,
184 chunksize=chunksize, version=version)
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.pxi in pyarrow.lib.write_feather()
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Negative buffer resize: -114399492
{code}
was (Author: ghuls):
It seems like the length of the column names does not matter:
{code:python}
def feather_bisect_long_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['motif' * 100 + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['region' * 100 + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
def feather_bisect_short_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['m' + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['r' + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
def feather_bisect_no_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions))
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
{code}
All work with 499998 but not with 499999 columns.
I also managed to get another error message when I constructed very big column names. If the same happens above (wrong calculation of buffer size), it might explain the corruption of the written Feather v2 file.
{code:python}
def feather_bisect_very_long_names(nbr_motifs, nbr_regions):
print(f'shape: ({nbr_motifs}, {nbr_regions})')
df_bisect = pd.DataFrame(
np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
index=pd.Index(['motif' * 10000 + str(i) for i in range(nbr_motifs)], name='motifs'),
columns=pd.Index(['region' * 10000 + str(i) for i in range(nbr_regions)], name='regions'),
)
print('write')
pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
print('read')
df_bisect_read = pf.read_feather('df_bisect.feather')
df_bisect_read.set_index(pd.Index(['motif0'], name='motifs'), inplace=True)
print(df_bisect_read.equals(df_bisect))
In [174]: feather_bisect_very_long_names(1, 500000-2)
shape: (1, 499998)
write
---------------------------------------------------------------------------
ArrowInvalid Traceback (most recent call last)
<ipython-input-174-5a1ef546402a> in <module>
----> 1 feather_bisect(1, 500000-2)
<ipython-input-173-dced00174027> in feather_bisect_very_long_names(nbr_motifs, nbr_regions)
8
9 print('write')
---> 10 pf.write_feather(df_bisect, 'df_bisect.feather', compression='uncompressed')
11
12 print('read')
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in write_feather(df, dest, compression, compression_level, chunksize, version)
180
181 try:
--> 182 ext.write_feather(table, dest, compression=compression,
183 compression_level=compression_level,
184 chunksize=chunksize, version=version)
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.pxi in pyarrow.lib.write_feather()
/software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Negative buffer resize: -114399492
{code}
> [Python] PyArrow writes invalid Feather v2 file: OSError: Verification of flatbuffer-encoded Footer failed.
> -----------------------------------------------------------------------------------------------------------
>
> Key: ARROW-10056
> URL: https://issues.apache.org/jira/browse/ARROW-10056
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Affects Versions: 1.0.1
> Environment: CentOS7
> conda environment with pyarrow 1.0.1, numpy 1.19.1 and pandas 1.1.1
> Reporter: Gert Hulselmans
> Priority: Major
>
> pyarrow writes an invalid Feather v2 file, which it can't read afterwards.
> {code:java}
> OSError: Verification of flatbuffer-encoded Footer failed.
> {code}
> The following code reproduces the problem for me:
> {code:python}
> import pyarrow as pa
> import pyarrow.feather as pf
> import numpy as np
> import pandas as pd
> nbr_regions = 1223024
> nbr_motifs = 4891
> # Create (big) dataframe.
> df = pd.DataFrame(
> np.arange(nbr_regions * nbr_motifs, dtype=np.float32).reshape((nbr_regions, nbr_motifs)),
> index=pd.Index(['region' + str(i) for i in range(nbr_regions)], name='regions'),
> columns=pd.Index(['motif' + str(i) for i in range(nbr_motifs)], name='motifs')
> )
> # Transpose dataframe
> df_transposed = df.transpose()
> # Write transposed dataframe to Feather v2 format.
> pf.write_feather(df_transposed, 'df_transposed.feather')
> # Trying to read the transposed dataframe from Feather v2 format, results in this error:
> df_transposed_read = pf.read_feather('df_transposed.feather')
> {code}
> {code:python}
> ---------------------------------------------------------------------------
> OSError Traceback (most recent call last)
> <ipython-input-64-b41ad5157e77> in <module>
> ----> 1 df_transposed_read = pf.read_feather('df_transposed.feather')
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in read_feather(source, columns, use_threads, memory_map)
> 213 """
> 214 _check_pandas_version()
> --> 215 return (read_table(source, columns=columns, memory_map=memory_map)
> 216 .to_pandas(use_threads=use_threads))
> 217
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in read_table(source, columns, memory_map)
> 235 """
> 236 reader = ext.FeatherReader()
> --> 237 reader.open(source, use_memory_map=memory_map)
> 238
> 239 if columns is None:
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.pxi in pyarrow.lib.FeatherReader.open()
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
> OSError: Verification of flatbuffer-encoded Footer failed.
> {code}
> Later I discovered that it happens also if the original dataframe is created in the transposed order:
> {code:python}
> # Create (big) dataframe.
> df_without_transpose = pd.DataFrame(
> np.arange(nbr_motifs * nbr_regions, dtype=np.float32).reshape((nbr_motifs, nbr_regions)),
> index=pd.Index(['motif' + str(i) for i in range(nbr_motifs)], name='motifs'),
> columns=pd.Index(['region' + str(i) for i in range(nbr_regions)], name='regions'),
> )
> pf.write_feather(df_without_transpose, 'df_without_transpose.feather')
> df_without_transpose_read = pf.read_feather('df_without_transpose.feather')
> ---------------------------------------------------------------------------
> OSError Traceback (most recent call last)
> <ipython-input-91-3cdad1d58c35> in <module>
> ----> 1 df_without_transpose_read = pf.read_feather('df_without_transpose.feather')
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in read_feather(source, columns, use_threads, memory_map)
> 213 """
> 214 _check_pandas_version()
> --> 215 return (read_table(source, columns=columns, memory_map=memory_map)
> 216 .to_pandas(use_threads=use_threads))
> 217
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.py in read_table(source, columns, memory_map)
> 235 """
> 236 reader = ext.FeatherReader()
> --> 237 reader.open(source, use_memory_map=memory_map)
> 238
> 239 if columns is None:
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/feather.pxi in pyarrow.lib.FeatherReader.open()
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
> /software/miniconda3/envs/pyarrow/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
> OSError: Verification of flatbuffer-encoded Footer failed.
> {code}
> Writing to Feather v1 format works:
> {code:python}
> pf.write_feather(df_transposed, 'df_transposed.v1.feather', version=1)
> df_transposed_read_v1 = pf.read_feather('df_transposed.v1.feather')
> # Now do the same, but also save the index in the Feather v1 file.
> df_transposed_reset_index = df_transposed.reset_index()
> pf.write_feather(df_transposed_reset_index, 'df_transposed_reset_index.v1.feather', version=1)
> df_transposed_reset_index_read_v1 = pf.read_feather('df_transposed_reset_index.v1.feather')
> # Returns True
> df_transposed_reset_index_read_v1.equals(df_transposed)
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)