Posted to jira@arrow.apache.org by "Joris Van den Bossche (Jira)" <ji...@apache.org> on 2021/11/08 19:08:00 UTC

[jira] [Commented] (ARROW-14629) [Release][Python] Parquet test fails on AlmaLinux8

    [ https://issues.apache.org/jira/browse/ARROW-14629?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17440682#comment-17440682 ] 

Joris Van den Bossche commented on ARROW-14629:
-----------------------------------------------

Ah, we forgot to add a {{dataset}} marker in this case. (It's easy to forget, since we don't have a CI build that does _not_ include the dataset module at the moment.)
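
For reference, pyarrow's Parquet tests guard dataset-dependent cases with a pytest marker; a minimal sketch of the missing annotation, using the failing test from the report below (the marker name {{dataset}} comes from this comment, the decorator placement is illustrative):

{code:python}
import pytest
import pyarrow as pa
import pyarrow.parquet as pq


@pytest.mark.dataset  # deselected on builds without the pyarrow._dataset extension
def test_permutation_of_column_order(tempdir):
    # pq.read_table() on a directory of files goes through pyarrow.dataset,
    # so this test must carry the marker
    case = tempdir / "dataset_column_order_permutation"
    case.mkdir(exist_ok=True)

    data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
    pq.write_table(data1, case / "data1.parquet")

    data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
    pq.write_table(data2, case / "data2.parquet")

    table = pq.read_table(str(case))
{code}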

> [Release][Python] Parquet test fails on AlmaLinux8
> --------------------------------------------------
>
>                 Key: ARROW-14629
>                 URL: https://issues.apache.org/jira/browse/ARROW-14629
>             Project: Apache Arrow
>          Issue Type: Bug
>            Reporter: Benson Muite
>            Assignee: Joris Van den Bossche
>            Priority: Minor
>             Fix For: 7.0.0, 6.0.1
>
>
> When running the verification tests on AlmaLinux 8, a Parquet test fails.
> Main steps to reproduce:
> {code:bash}
> dnf -y update
> dnf clean all
> dnf -y install \
>   dnf-plugins-core \
>   yum-utils
> dnf config-manager --set-enabled powertools
> dnf -y update
> dnf -y module disable ruby
> dnf -y module enable ruby:2.7
> dnf -y groupinstall "Development Tools"
> dnf -y install \
>   epel-release \
>   ninja-build \
>   libcurl-devel \
>   python3-pip \
>   python3-devel \
>   cmake \
>   git \
>   ncurses-devel \
>   gobject-introspection-devel \
>   libffi-devel \
>   openssl-devel \
>   maven \
>   java-1.8.0-openjdk-devel \
>   wget \
>   readline-devel \
>   gdbm-devel \
>   ruby-devel \
>   llvm-toolset \
>   llvm-devel
> dnf -y update
> alias pip=pip3
> alternatives --set python /usr/bin/python3
> ln -s /usr/bin/pip3 /usr/bin/pip
> git clone https://github.com/apache/arrow/
> pip install -r arrow/python/requirements-build.txt \
>      -r arrow/python/requirements-test.txt
> cd arrow
> mkdir dist
> export ARROW_HOME=$(pwd)/dist
> export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
> cd cpp
> mkdir build
> cd build  # needed: the cmake invocation below uses .. as the source directory
> cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
>       -DCMAKE_INSTALL_LIBDIR=lib \
>       -DARROW_WITH_BZ2=ON \
>       -DARROW_WITH_ZLIB=ON \
>       -DARROW_WITH_ZSTD=ON \
>       -DARROW_WITH_LZ4=ON \
>       -DARROW_WITH_SNAPPY=ON \
>       -DARROW_WITH_BROTLI=ON \
>       -DARROW_PARQUET=ON \
>       -DARROW_PYTHON=ON \
>       -DARROW_BUILD_TESTS=ON \
>       ..
> make -j4
> make install
> cd ..
> cd ..
> cd python
> export PYARROW_WITH_PARQUET=1
> python setup.py build_ext --inplace
> export PYARROW_TEST_PARQUET=ON
> python -m pytest -r s --pyargs pyarrow
> {code}
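> Note that the steps above enable Parquet but never build the dataset module, which is what the error below complains about. A hedged sketch of the extra switches that would include it (assuming the standard Arrow 6.x build options):
> {code:bash}
> # C++ side: build the dataset module alongside Parquet
> cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
>       -DARROW_DATASET=ON \
>       -DARROW_PARQUET=ON \
>       -DARROW_PYTHON=ON \
>       ..
> make -j4 && make install
> # Python side: compile the pyarrow._dataset extension as well
> export PYARROW_WITH_DATASET=1
> python setup.py build_ext --inplace
> {code}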
> Resulting error:
> {code:bash}
> ============================================ FAILURES =============================================
> ________________________________ test_permutation_of_column_order _________________________________
> source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
> columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
> memory_map = False, read_dictionary = None
> filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
> buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
> pre_buffer = True, coerce_int96_timestamp_unit = None
>     def read_table(source, columns=None, use_threads=True, metadata=None,
>                    use_pandas_metadata=False, memory_map=False,
>                    read_dictionary=None, filesystem=None, filters=None,
>                    buffer_size=0, partitioning="hive", use_legacy_dataset=False,
>                    ignore_prefixes=None, pre_buffer=True,
>                    coerce_int96_timestamp_unit=None):
>         if not use_legacy_dataset:
>             if metadata is not None:
>                 raise ValueError(
>                     "The 'metadata' keyword is no longer supported with the new "
>                     "datasets-based implementation. Specify "
>                     "'use_legacy_dataset=True' to temporarily recover the old "
>                     "behaviour."
>                 )
>             try:
>                 dataset = _ParquetDatasetV2(
>                     source,
>                     filesystem=filesystem,
>                     partitioning=partitioning,
>                     memory_map=memory_map,
>                     read_dictionary=read_dictionary,
>                     buffer_size=buffer_size,
>                     filters=filters,
>                     ignore_prefixes=ignore_prefixes,
>                     pre_buffer=pre_buffer,
> >                   coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
>                 )
> pyarrow/parquet.py:1960: 
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
> path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
> filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
> memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
> kwargs = {}
>     def __init__(self, path_or_paths, filesystem=None, filters=None,
>                  partitioning="hive", read_dictionary=None, buffer_size=None,
>                  memory_map=False, ignore_prefixes=None, pre_buffer=True,
>                  coerce_int96_timestamp_unit=None, **kwargs):
> >       import pyarrow.dataset as ds
> pyarrow/parquet.py:1680: 
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
>     """Dataset is currently unstable. APIs subject to change without notice."""
>     
>     import pyarrow as pa
>     from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
>     
> >   from pyarrow._dataset import (  # noqa
>         CsvFileFormat,
>         CsvFragmentScanOptions,
>         Expression,
>         Dataset,
>         DatasetFactory,
>         DirectoryPartitioning,
>         FileFormat,
>         FileFragment,
>         FileSystemDataset,
>         FileSystemDatasetFactory,
>         FileSystemFactoryOptions,
>         FileWriteOptions,
>         Fragment,
>         HivePartitioning,
>         IpcFileFormat,
>         IpcFileWriteOptions,
>         InMemoryDataset,
>         ParquetDatasetFactory,
>         ParquetFactoryOptions,
>         ParquetFileFormat,
>         ParquetFileFragment,
>         ParquetFileWriteOptions,
>         ParquetFragmentScanOptions,
>         ParquetReadOptions,
>         Partitioning,
>         PartitioningFactory,
>         RowGroupInfo,
>         Scanner,
>         TaggedRecordBatch,
>         UnionDataset,
>         UnionDatasetFactory,
>         _get_partition_keys,
>         _filesystemdataset_write,
>     )
> E   ModuleNotFoundError: No module named 'pyarrow._dataset'
> pyarrow/dataset.py:23: ModuleNotFoundError
> During handling of the above exception, another exception occurred:
> tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')
>     def test_permutation_of_column_order(tempdir):
>         # ARROW-2366
>         case = tempdir / "dataset_column_order_permutation"
>         case.mkdir(exist_ok=True)
>     
>         data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
>         pq.write_table(data1, case / "data1.parquet")
>     
>         data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
>         pq.write_table(data2, case / "data2.parquet")
>     
> >       table = pq.read_table(str(case))
> pyarrow/tests/parquet/test_basic.py:645: 
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> pyarrow/parquet.py:1977: in read_table
>     source = filesystem.open_input_file(path)
> pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
>     in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
> pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
>     return check_status(status)
> _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> >   raise IOError(message)
> E   OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory
> pyarrow/error.pxi:114: OSError
> {code}
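> The two tracebacks are chained: with {{pyarrow._dataset}} missing, {{read_table}} falls back to opening the source as a single file ({{pyarrow/parquet.py:1977}} above), which then raises {{OSError}} because the source is a directory. Besides the per-test marker, a module-level guard along these lines (a sketch, not necessarily the test suite's actual mechanism) would skip such tests on dataset-less builds:
> {code:python}
> import pytest
>
> try:
>     import pyarrow.dataset  # noqa: F401
>     _have_dataset = True
> except ImportError:
>     _have_dataset = False
>
> # applies to every test collected from this module
> pytestmark = pytest.mark.skipif(
>     not _have_dataset, reason="pyarrow built without the dataset module")
> {code}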



--
This message was sent by Atlassian Jira
(v8.20.1#820001)