You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "Moritz Meister (Jira)" <ji...@apache.org> on 2022/11/07 23:02:00 UTC
[jira] [Created] (ARROW-18276) Reading from hdfs using pyarrow 10.0.0 throws OSError: [Errno 22] Opening HDFS file
Moritz Meister created ARROW-18276:
--------------------------------------
Summary: Reading from hdfs using pyarrow 10.0.0 throws OSError: [Errno 22] Opening HDFS file
Key: ARROW-18276
URL: https://issues.apache.org/jira/browse/ARROW-18276
Project: Apache Arrow
Issue Type: Bug
Components: Python
Affects Versions: 10.0.0
Environment: pyarrow 10.0.0
fsspec 2022.7.1
pandas 1.3.3
python 3.8.11
Reporter: Moritz Meister
Hey!
I am trying to read a CSV file using pyarrow together with fsspec from HDFS.
I used to do this with pyarrow 9.0.0 and fsspec 2022.7.1; however, after I upgraded to pyarrow 10.0.0, this stopped working.
I am not quite sure whether this is an incompatibility introduced in the new pyarrow version or a bug in fsspec. So if I am in the wrong place here, please let me know.
Apart from pyarrow 10.0.0 and fsspec 2022.7.1, I am using pandas version 1.3.3 and python 3.8.11.
Here is the full stack trace:
```python
pd.read_csv("hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
480
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
483
484 if chunksize or iterator:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
809 self.options["has_index_names"] = kwds["has_index_names"]
810
--> 811 self._engine = self._make_engine(self.engine)
812
813 def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
1041
1042 def _failover_to_python(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
49
50 # open handles
---> 51 self._open_handles(src, kwds)
52 assert self.handles is not None
53
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
220 Let the readers open IOHandles after they are done with their potential raises.
221 """
--> 222 self.handles = get_handle(
223 src,
224 "r",
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
607
608 # open URLs
--> 609 ioargs = _get_filepath_or_buffer(
610 path_or_buf,
611 encoding=encoding,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
356
357 try:
--> 358 file_obj = fsspec.open(
359 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
360 ).open()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in open(self)
133 during the life of the file-like it generates.
134 """
--> 135 return self.__enter__()
136
137 def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in __enter__(self)
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
102
--> 103 f = self.fs.open(self.path, mode=mode)
104
105 self.fobjects = [f]
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/spec.py in open(self, path, mode, block_size, cache_options, compression, **kwargs)
1092 else:
1093 ac = kwargs.pop("autocommit", not self._intrans)
-> 1094 f = self._open(
1095 path,
1096 mode=mode,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in wrapper(*args, **kwargs)
15 def wrapper(*args, **kwargs):
16 try:
---> 17 return func(*args, **kwargs)
18 except OSError as exception:
19 if not exception.args:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in _open(self, path, mode, block_size, **kwargs)
157 # disable compression auto-detection
158 _kwargs["compression"] = None
--> 159 stream = method(path, **_kwargs)
160
161 return ArrowFile(self, stream, path, mode, block_size, **kwargs)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/_fs.pyx in pyarrow._fs.FileSystem.open_input_stream()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
OSError: [Errno 22] Opening HDFS file '10.0.2.15:8020/Projects/test/test_Training_Datasets/td_1/td/part-00000-acb66eca-636e-4917-9673-e6646f0b4486-c000.csv' failed. Detail: [errno 22] Invalid argument
```
However, if I leave out the namenode IP and port, it works as expected:
```python
pd.read_csv("hdfs:///Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
```
Any help is appreciated, thank you!
--
This message was sent by Atlassian Jira
(v8.20.10#820010)