Posted to jira@arrow.apache.org by "Joris Van den Bossche (Jira)" <ji...@apache.org> on 2022/03/21 16:35:00 UTC

[jira] [Comment Edited] (ARROW-15910) [Python] pyarrow.parquet.read_table either returns FileNotFound or ArrowInvalid

    [ https://issues.apache.org/jira/browse/ARROW-15910?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17508892#comment-17508892 ] 

Joris Van den Bossche edited comment on ARROW-15910 at 3/21/22, 4:34 PM:
-------------------------------------------------------------------------

{code}
FileNotFoundError                         Traceback (most recent call last)
/tmp/ipykernel_1684296/1250703457.py in <module>
      3 file_path="MyBucket/path/Name_of_parquet.parquet/"
      4 fs=gcsfs.GCSFileSystem()
----> 5 table=pq.read_table(file_path,filesystem=fs)

/opt/conda/lib/python3.7/site-packages/pyarrow/parquet.py in read_table(source, columns, use_threads, metadata, use_pandas_metadata, memory_map, read_dictionary, filesystem, filters, buffer_size, partitioning, use_legacy_dataset, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit)
   1968                 ignore_prefixes=ignore_prefixes,
   1969                 pre_buffer=pre_buffer,
-> 1970                 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
   1971             )
   1972         except ImportError:

/opt/conda/lib/python3.7/site-packages/pyarrow/parquet.py in __init__(self, path_or_paths, filesystem, filters, partitioning, read_dictionary, buffer_size, memory_map, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, **kwargs)
   1782                                    format=parquet_format,
   1783                                    partitioning=partitioning,
-> 1784                                    ignore_prefixes=ignore_prefixes)
   1785 
   1786     @property

/opt/conda/lib/python3.7/site-packages/pyarrow/dataset.py in dataset(source, schema, format, filesystem, partitioning, partition_base_dir, exclude_invalid_files, ignore_prefixes)
    665 
    666     if _is_path_like(source):
--> 667         return _filesystem_dataset(source, **kwargs)
    668     elif isinstance(source, (tuple, list)):
    669         if all(_is_path_like(elem) for elem in source):

/opt/conda/lib/python3.7/site-packages/pyarrow/dataset.py in _filesystem_dataset(source, schema, filesystem, partitioning, format, partition_base_dir, exclude_invalid_files, selector_ignore_prefixes)
    420     factory = FileSystemDatasetFactory(fs, paths_or_selector, format, options)
    421 
--> 422     return factory.finish(schema)
    423 
    424 

/opt/conda/lib/python3.7/site-packages/pyarrow/_dataset.pyx in pyarrow._dataset.DatasetFactory.finish()

/opt/conda/lib/python3.7/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()

/opt/conda/lib/python3.7/site-packages/pyarrow/_fs.pyx in pyarrow._fs._cb_open_input_file()

/opt/conda/lib/python3.7/site-packages/pyarrow/fs.py in open_input_file(self, path)
    392 
    393         if not self.fs.isfile(path):
--> 394             raise FileNotFoundError(path)
    395 
    396         return PythonFile(self.fs.open(path, mode="rb"), mode="r")
{code}
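
The FileNotFoundError above is raised inside FSSpecHandler.open_input_file, i.e. after dataset discovery has already decided to treat the supplied path as a single file, and the fsspec isfile check then fails. A minimal sketch for checking how the wrapped gcsfs filesystem classifies that path (the path is a placeholder, and the FileType interpretation is my reading of the traceback, not something confirmed elsewhere in this issue):

{code}
import gcsfs
from pyarrow.fs import PyFileSystem, FSSpecHandler

fs = gcsfs.GCSFileSystem()
pa_fs = PyFileSystem(FSSpecHandler(fs))

# Placeholder path, scheme-less and with the trailing slash, as in the call
# that produced the traceback. If this reports FileType.File rather than
# FileType.Directory, read_table ends up in open_input_file, which is the
# isfile check that fails above.
info = pa_fs.get_file_info("MyBucket/path/Name_of_parquet.parquet/")
print(info.type, info.path)
{code}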


was (Author: JIRAUSER286431):
(identical traceback, originally posted without {code} formatting)

> [Python] pyarrow.parquet.read_table either returns FileNotFound or ArrowInvalid
> -------------------------------------------------------------------------------
>
>                 Key: ARROW-15910
>                 URL: https://issues.apache.org/jira/browse/ARROW-15910
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Parquet, Python
>    Affects Versions: 6.0.1, 7.0.0
>         Environment: GCP JupyterLab notebooks
>            Reporter: Callista Rogers
>            Priority: Major
>
> Running the code below results in {{"GetFileInfo() yielded path 'myBucket/features/MyParquet.parquet/year=2022/part-0019.snappy.parquet' which is outside base dir 'gs://myBucket/features/MyParquet.parquet/'"}}:
> {code}
> import pyarrow.parquet as pq
> import gcsfs
> file_path="gs://myBucket/features/MyParquet.parquet/"
> fs=gcsfs.GCSFileSystem()
> table=pq.read_table(file_path,filesystem=fs) 
> {code}
> Removing the gs:// from file_path results in a {{FileNotFoundError}}. Any variation of / or // at the beginning of the path gives me the 'outside base dir' error.
> I also ran the code below and got valid results using both file_path patterns, so I know the path itself is found just fine.
> {code}
> from pyarrow.fs import FileSelector, PyFileSystem, FSSpecHandler
> filesys = PyFileSystem(FSSpecHandler(fs))
> selector = FileSelector(file_path, recursive=True)
> filesys.get_file_info(selector)
> {code}
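
The two errors in the description line up with how the path is spelled: when an fsspec filesystem is passed explicitly, pyarrow expects a bucket-relative path without the "gs://" scheme, so discovered file paths come back scheme-less and no longer sit under the scheme-qualified base dir (the "outside base dir" message quotes exactly that mismatch), while the scheme-less spelling with the trailing slash runs into the isfile check shown in the traceback above. A hedged sketch of call patterns worth trying (the exact path spellings are assumptions, not a confirmed fix for this issue):

{code}
import pyarrow.parquet as pq
import gcsfs
from pyarrow.fs import PyFileSystem, FSSpecHandler

fs = gcsfs.GCSFileSystem()

# 1. Explicit fsspec filesystem: scheme-less, bucket-relative path, and try
#    dropping the trailing slash (placeholder path mirroring the report).
table = pq.read_table("myBucket/features/MyParquet.parquet", filesystem=fs)

# 2. Same path, with the fsspec filesystem wrapped explicitly, matching the
#    working get_file_info snippet in the description.
table = pq.read_table("myBucket/features/MyParquet.parquet",
                      filesystem=PyFileSystem(FSSpecHandler(fs)))
{code}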



--
This message was sent by Atlassian Jira
(v8.20.1#820001)