You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Benson Muite (Jira)" <ji...@apache.org> on 2021/11/08 13:31:00 UTC
[jira] [Created] (ARROW-14629) [Release][Python] Parquet test fails
on AlmaLinux8
Benson Muite created ARROW-14629:
------------------------------------
Summary: [Release][Python] Parquet test fails on AlmaLinux8
Key: ARROW-14629
URL: https://issues.apache.org/jira/browse/ARROW-14629
Project: Apache Arrow
Issue Type: Bug
Reporter: Benson Muite
When running verification tests on AlmaLinux 8, the Parquet test fails.
Main steps to reproduce:
{code:bash}
dnf -y update
dnf clean all
dnf -y install \
dnf-plugins-core \
yum-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y module disable ruby
dnf -y module enable ruby:2.7
dnf -y groupinstall "Development Tools"
dnf -y install \
epel-release \
ninja-build \
libcurl-devel \
python3-pip \
python3-devel \
cmake \
git \
ncurses-devel \
gobject-introspection-devel \
libffi-devel \
openssl-devel \
maven \
java-1.8.0-openjdk-devel \
wget \
readline-devel \
gdbm-devel \
ruby-devel \
llvm-toolset \
llvm-devel
dnf -y update
alias pip=pip3
alternatives --set python /usr/bin/python3
ln -s /usr/bin/pip3 /usr/bin/pip
git clone https://github.com/apache/arrow/
pip install -r arrow/python/requirements-build.txt \
-r arrow/python/requirements-test.txt
cd arrow
mkdir dist
export ARROW_HOME=$(pwd)/dist
export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
cd cpp
mkdir build
cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
-DCMAKE_INSTALL_LIBDIR=lib \
-DARROW_WITH_BZ2=ON \
-DARROW_WITH_ZLIB=ON \
-DARROW_WITH_ZSTD=ON \
-DARROW_WITH_LZ4=ON \
-DARROW_WITH_SNAPPY=ON \
-DARROW_WITH_BROTLI=ON \
-DARROW_PARQUET=ON \
-DARROW_PYTHON=ON \
-DARROW_BUILD_TESTS=ON \
..
make -j4
make install
cd ..
cd ..
cd python
export PYARROW_WITH_PARQUET=1
python setup.py build_ext --inplace
export PYARROW_TEST_PARQUET=ON
python -m pytest -r s --pyargs pyarrow
{code}
Resulting error:
{code:bash}
============================================ FAILURES =============================================
________________________________ test_permutation_of_column_order _________________________________
source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
memory_map = False, read_dictionary = None
filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
pre_buffer = True, coerce_int96_timestamp_unit = None
def read_table(source, columns=None, use_threads=True, metadata=None,
use_pandas_metadata=False, memory_map=False,
read_dictionary=None, filesystem=None, filters=None,
buffer_size=0, partitioning="hive", use_legacy_dataset=False,
ignore_prefixes=None, pre_buffer=True,
coerce_int96_timestamp_unit=None):
if not use_legacy_dataset:
if metadata is not None:
raise ValueError(
"The 'metadata' keyword is no longer supported with the new "
"datasets-based implementation. Specify "
"'use_legacy_dataset=True' to temporarily recover the old "
"behaviour."
)
try:
dataset = _ParquetDatasetV2(
source,
filesystem=filesystem,
partitioning=partitioning,
memory_map=memory_map,
read_dictionary=read_dictionary,
buffer_size=buffer_size,
filters=filters,
ignore_prefixes=ignore_prefixes,
pre_buffer=pre_buffer,
> coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
)
pyarrow/parquet.py:1960:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
kwargs = {}
def __init__(self, path_or_paths, filesystem=None, filters=None,
partitioning="hive", read_dictionary=None, buffer_size=None,
memory_map=False, ignore_prefixes=None, pre_buffer=True,
coerce_int96_timestamp_unit=None, **kwargs):
> import pyarrow.dataset as ds
pyarrow/parquet.py:1680:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
"""Dataset is currently unstable. APIs subject to change without notice."""
import pyarrow as pa
from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
> from pyarrow._dataset import ( # noqa
CsvFileFormat,
CsvFragmentScanOptions,
Expression,
Dataset,
DatasetFactory,
DirectoryPartitioning,
FileFormat,
FileFragment,
FileSystemDataset,
FileSystemDatasetFactory,
FileSystemFactoryOptions,
FileWriteOptions,
Fragment,
HivePartitioning,
IpcFileFormat,
IpcFileWriteOptions,
InMemoryDataset,
ParquetDatasetFactory,
ParquetFactoryOptions,
ParquetFileFormat,
ParquetFileFragment,
ParquetFileWriteOptions,
ParquetFragmentScanOptions,
ParquetReadOptions,
Partitioning,
PartitioningFactory,
RowGroupInfo,
Scanner,
TaggedRecordBatch,
UnionDataset,
UnionDatasetFactory,
_get_partition_keys,
_filesystemdataset_write,
)
E ModuleNotFoundError: No module named 'pyarrow._dataset'
pyarrow/dataset.py:23: ModuleNotFoundError
During handling of the above exception, another exception occurred:
tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')
def test_permutation_of_column_order(tempdir):
# ARROW-2366
case = tempdir / "dataset_column_order_permutation"
case.mkdir(exist_ok=True)
data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
pq.write_table(data1, case / "data1.parquet")
data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
pq.write_table(data2, case / "data2.parquet")
> table = pq.read_table(str(case))
pyarrow/tests/parquet/test_basic.py:645:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyarrow/parquet.py:1977: in read_table
source = filesystem.open_input_file(path)
pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
return check_status(status)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> raise IOError(message)
E OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory
pyarrow/error.pxi:114: OSError
{code}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)