You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Joris Van den Bossche (Jira)" <ji...@apache.org> on 2021/11/08 19:04:00 UTC
[jira] [Assigned] (ARROW-14629) [Release][Python] Parquet test
fails on AlmaLinux8
[ https://issues.apache.org/jira/browse/ARROW-14629?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Joris Van den Bossche reassigned ARROW-14629:
---------------------------------------------
Assignee: Joris Van den Bossche
> [Release][Python] Parquet test fails on AlmaLinux8
> --------------------------------------------------
>
> Key: ARROW-14629
> URL: https://issues.apache.org/jira/browse/ARROW-14629
> Project: Apache Arrow
> Issue Type: Bug
> Reporter: Benson Muite
> Assignee: Joris Van den Bossche
> Priority: Minor
>
> When running verification tests on AlmaLinux 8, the Parquet test fails.
> Main steps to reproduce:
> {code:bash}
> dnf -y update
> dnf clean all
> dnf -y install \
> dnf-plugins-core \
> yum-utils
> dnf config-manager --set-enabled powertools
> dnf -y update
> dnf -y module disable ruby
> dnf -y module enable ruby:2.7
> dnf -y groupinstall "Development Tools"
> dnf -y install \
> epel-release \
> ninja-build \
> libcurl-devel \
> python3-pip \
> python3-devel \
> cmake \
> git \
> ncurses-devel \
> gobject-introspection-devel \
> libffi-devel \
> openssl-devel \
> maven \
> java-1.8.0-openjdk-devel \
> wget \
> readline-devel \
> gdbm-devel \
> ruby-devel \
> llvm-toolset \
> llvm-devel
> dnf -y update
> alias pip=pip3
> alternatives --set python /usr/bin/python3
> ln -s /usr/bin/pip3 /usr/bin/pip
> git clone https://github.com/apache/arrow/
> pip install -r arrow/python/requirements-build.txt \
> -r arrow/python/requirements-test.txt
> cd arrow
> mkdir dist
> export ARROW_HOME=$(pwd)/dist
> export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
> cd cpp
> mkdir build
> cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
> -DCMAKE_INSTALL_LIBDIR=lib \
> -DARROW_WITH_BZ2=ON \
> -DARROW_WITH_ZLIB=ON \
> -DARROW_WITH_ZSTD=ON \
> -DARROW_WITH_LZ4=ON \
> -DARROW_WITH_SNAPPY=ON \
> -DARROW_WITH_BROTLI=ON \
> -DARROW_PARQUET=ON \
> -DARROW_PYTHON=ON \
> -DARROW_BUILD_TESTS=ON \
> ..
> make -j4
> make install
> cd ..
> cd ..
> cd python
> export PYARROW_WITH_PARQUET=1
> python setup.py build_ext --inplace
> export PYARROW_TEST_PARQUET=ON
> python -m pytest -r s --pyargs pyarrow
> {code}
> Resulting error:
> {code:bash}
> ============================================ FAILURES =============================================
> ________________________________ test_permutation_of_column_order _________________________________
> source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
> columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
> memory_map = False, read_dictionary = None
> filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
> buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
> pre_buffer = True, coerce_int96_timestamp_unit = None
> def read_table(source, columns=None, use_threads=True, metadata=None,
> use_pandas_metadata=False, memory_map=False,
> read_dictionary=None, filesystem=None, filters=None,
> buffer_size=0, partitioning="hive", use_legacy_dataset=False,
> ignore_prefixes=None, pre_buffer=True,
> coerce_int96_timestamp_unit=None):
> if not use_legacy_dataset:
> if metadata is not None:
> raise ValueError(
> "The 'metadata' keyword is no longer supported with the new "
> "datasets-based implementation. Specify "
> "'use_legacy_dataset=True' to temporarily recover the old "
> "behaviour."
> )
> try:
> dataset = _ParquetDatasetV2(
> source,
> filesystem=filesystem,
> partitioning=partitioning,
> memory_map=memory_map,
> read_dictionary=read_dictionary,
> buffer_size=buffer_size,
> filters=filters,
> ignore_prefixes=ignore_prefixes,
> pre_buffer=pre_buffer,
> > coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
> )
> pyarrow/parquet.py:1960:
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
> path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
> filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
> memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
> kwargs = {}
> def __init__(self, path_or_paths, filesystem=None, filters=None,
> partitioning="hive", read_dictionary=None, buffer_size=None,
> memory_map=False, ignore_prefixes=None, pre_buffer=True,
> coerce_int96_timestamp_unit=None, **kwargs):
> > import pyarrow.dataset as ds
> pyarrow/parquet.py:1680:
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> """Dataset is currently unstable. APIs subject to change without notice."""
>
> import pyarrow as pa
> from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
>
> > from pyarrow._dataset import ( # noqa
> CsvFileFormat,
> CsvFragmentScanOptions,
> Expression,
> Dataset,
> DatasetFactory,
> DirectoryPartitioning,
> FileFormat,
> FileFragment,
> FileSystemDataset,
> FileSystemDatasetFactory,
> FileSystemFactoryOptions,
> FileWriteOptions,
> Fragment,
> HivePartitioning,
> IpcFileFormat,
> IpcFileWriteOptions,
> InMemoryDataset,
> ParquetDatasetFactory,
> ParquetFactoryOptions,
> ParquetFileFormat,
> ParquetFileFragment,
> ParquetFileWriteOptions,
> ParquetFragmentScanOptions,
> ParquetReadOptions,
> Partitioning,
> PartitioningFactory,
> RowGroupInfo,
> Scanner,
> TaggedRecordBatch,
> UnionDataset,
> UnionDatasetFactory,
> _get_partition_keys,
> _filesystemdataset_write,
> )
> E ModuleNotFoundError: No module named 'pyarrow._dataset'
> pyarrow/dataset.py:23: ModuleNotFoundError
> During handling of the above exception, another exception occurred:
> tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')
> def test_permutation_of_column_order(tempdir):
> # ARROW-2366
> case = tempdir / "dataset_column_order_permutation"
> case.mkdir(exist_ok=True)
>
> data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
> pq.write_table(data1, case / "data1.parquet")
>
> data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
> pq.write_table(data2, case / "data2.parquet")
>
> > table = pq.read_table(str(case))
> pyarrow/tests/parquet/test_basic.py:645:
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> pyarrow/parquet.py:1977: in read_table
> source = filesystem.open_input_file(path)
> pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
> in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
> pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
> return check_status(status)
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> > raise IOError(message)
> E OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory
> pyarrow/error.pxi:114: OSError
> {code}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)