Posted to jira@arrow.apache.org by "Benson Muite (Jira)" <ji...@apache.org> on 2021/11/08 13:31:00 UTC

[jira] [Created] (ARROW-14629) [Release][Python] Parquet test fails on AlmaLinux8

Benson Muite created ARROW-14629:
------------------------------------

             Summary: [Release][Python] Parquet test fails on AlmaLinux8
                 Key: ARROW-14629
                 URL: https://issues.apache.org/jira/browse/ARROW-14629
             Project: Apache Arrow
          Issue Type: Bug
            Reporter: Benson Muite


When running the release verification tests on AlmaLinux 8, a Parquet test fails.

Main steps to reproduce:

{code:bash}
dnf -y update
dnf clean all
dnf -y install \
  dnf-plugins-core \
  yum-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y module disable ruby
dnf -y module enable ruby:2.7
dnf -y groupinstall "Development Tools"
dnf -y install \
  epel-release \
  ninja-build \
  libcurl-devel \
  python3-pip \
  python3-devel \
  cmake \
  git \
  ncurses-devel \
  gobject-introspection-devel \
  libffi-devel \
  openssl-devel \
  maven \
  java-1.8.0-openjdk-devel \
  wget \
  readline-devel \
  gdbm-devel \
  ruby-devel \
  llvm-toolset \
  llvm-devel
dnf -y update
# Make python3/pip3 the default python/pip
alias pip=pip3
alternatives --set python /usr/bin/python3
ln -s /usr/bin/pip3 /usr/bin/pip
git clone https://github.com/apache/arrow/
pip install -r arrow/python/requirements-build.txt \
     -r arrow/python/requirements-test.txt
cd arrow
mkdir dist
export ARROW_HOME=$(pwd)/dist
export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
cd cpp
mkdir build
cd build
# Configure the C++ library with Parquet and Python bindings enabled
cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
      -DCMAKE_INSTALL_LIBDIR=lib \
      -DARROW_WITH_BZ2=ON \
      -DARROW_WITH_ZLIB=ON \
      -DARROW_WITH_ZSTD=ON \
      -DARROW_WITH_LZ4=ON \
      -DARROW_WITH_SNAPPY=ON \
      -DARROW_WITH_BROTLI=ON \
      -DARROW_PARQUET=ON \
      -DARROW_PYTHON=ON \
      -DARROW_BUILD_TESTS=ON \
      ..
make -j4
make install
cd ..
cd ..
cd python
export PYARROW_WITH_PARQUET=1
python setup.py build_ext --inplace
export PYARROW_TEST_PARQUET=ON
python -m pytest -r s --pyargs pyarrow
{code}
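
Not part of the original steps, but a minimal sanity check (assuming the in-place build above completed) that shows which optional pyarrow extensions were actually compiled before running the full test suite:

{code:bash}
# Run from arrow/python after "python setup.py build_ext --inplace".
# Each import corresponds to a compiled extension module; a failure here
# means the matching C++ component was not enabled at cmake time.
python - <<'EOF'
for mod in ("pyarrow", "pyarrow.parquet", "pyarrow.dataset"):
    try:
        __import__(mod)
        print(mod, "OK")
    except ImportError as exc:
        print(mod, "MISSING:", exc)
EOF
{code}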

Resulting error:

{code}
============================================ FAILURES =============================================
________________________________ test_permutation_of_column_order _________________________________

source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
memory_map = False, read_dictionary = None
filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
pre_buffer = True, coerce_int96_timestamp_unit = None

    def read_table(source, columns=None, use_threads=True, metadata=None,
                   use_pandas_metadata=False, memory_map=False,
                   read_dictionary=None, filesystem=None, filters=None,
                   buffer_size=0, partitioning="hive", use_legacy_dataset=False,
                   ignore_prefixes=None, pre_buffer=True,
                   coerce_int96_timestamp_unit=None):
        if not use_legacy_dataset:
            if metadata is not None:
                raise ValueError(
                    "The 'metadata' keyword is no longer supported with the new "
                    "datasets-based implementation. Specify "
                    "'use_legacy_dataset=True' to temporarily recover the old "
                    "behaviour."
                )
            try:
                dataset = _ParquetDatasetV2(
                    source,
                    filesystem=filesystem,
                    partitioning=partitioning,
                    memory_map=memory_map,
                    read_dictionary=read_dictionary,
                    buffer_size=buffer_size,
                    filters=filters,
                    ignore_prefixes=ignore_prefixes,
                    pre_buffer=pre_buffer,
>                   coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
                )

pyarrow/parquet.py:1960: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
kwargs = {}

    def __init__(self, path_or_paths, filesystem=None, filters=None,
                 partitioning="hive", read_dictionary=None, buffer_size=None,
                 memory_map=False, ignore_prefixes=None, pre_buffer=True,
                 coerce_int96_timestamp_unit=None, **kwargs):
>       import pyarrow.dataset as ds

pyarrow/parquet.py:1680: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    """Dataset is currently unstable. APIs subject to change without notice."""
    
    import pyarrow as pa
    from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
    
>   from pyarrow._dataset import (  # noqa
        CsvFileFormat,
        CsvFragmentScanOptions,
        Expression,
        Dataset,
        DatasetFactory,
        DirectoryPartitioning,
        FileFormat,
        FileFragment,
        FileSystemDataset,
        FileSystemDatasetFactory,
        FileSystemFactoryOptions,
        FileWriteOptions,
        Fragment,
        HivePartitioning,
        IpcFileFormat,
        IpcFileWriteOptions,
        InMemoryDataset,
        ParquetDatasetFactory,
        ParquetFactoryOptions,
        ParquetFileFormat,
        ParquetFileFragment,
        ParquetFileWriteOptions,
        ParquetFragmentScanOptions,
        ParquetReadOptions,
        Partitioning,
        PartitioningFactory,
        RowGroupInfo,
        Scanner,
        TaggedRecordBatch,
        UnionDataset,
        UnionDatasetFactory,
        _get_partition_keys,
        _filesystemdataset_write,
    )
E   ModuleNotFoundError: No module named 'pyarrow._dataset'

pyarrow/dataset.py:23: ModuleNotFoundError

During handling of the above exception, another exception occurred:

tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')

    def test_permutation_of_column_order(tempdir):
        # ARROW-2366
        case = tempdir / "dataset_column_order_permutation"
        case.mkdir(exist_ok=True)
    
        data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
        pq.write_table(data1, case / "data1.parquet")
    
        data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
        pq.write_table(data2, case / "data2.parquet")
    
>       table = pq.read_table(str(case))

pyarrow/tests/parquet/test_basic.py:645: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyarrow/parquet.py:1977: in read_table
    source = filesystem.open_input_file(path)
pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
    in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
    return check_status(status)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

>   raise IOError(message)
E   OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory

pyarrow/error.pxi:114: OSError

{code}
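
The traceback points at {{pyarrow._dataset}} being absent rather than at Parquet itself: {{pq.read_table}} on a directory goes through the dataset-based implementation, whose import fails, and the fallback then tries to open the directory as a single file, producing the final OSError. A likely explanation is that the build above never enables the dataset component: the cmake invocation does not pass {{-DARROW_DATASET=ON}}, and {{PYARROW_WITH_DATASET}} is not exported before {{build_ext}}. A hedged sketch of a rebuild with those switches (unverified on AlmaLinux 8):

{code:bash}
# Unverified sketch: enable the dataset component and rebuild.
# ARROW_DATASET / PYARROW_WITH_DATASET control whether pyarrow._dataset is
# compiled; whether this alone fixes the failure has not been confirmed.
cd cpp/build
cmake -DARROW_DATASET=ON ..   # earlier -D options are kept in the CMake cache
make -j4
make install
cd ../../python
export PYARROW_WITH_DATASET=1
python setup.py build_ext --inplace
{code}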



