You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "Wes McKinney (Jira)" <ji...@apache.org> on 2019/09/19 19:12:00 UTC
[jira] [Created] (ARROW-6623) [CI][Python] Dask docker integration test broken perhaps by statistics-related change
Wes McKinney created ARROW-6623:
-----------------------------------
Summary: [CI][Python] Dask docker integration test broken perhaps by statistics-related change
Key: ARROW-6623
URL: https://issues.apache.org/jira/browse/ARROW-6623
Project: Apache Arrow
Issue Type: Bug
Components: Python
Reporter: Wes McKinney
Fix For: 0.15.0
See the new failure:
https://circleci.com/gh/ursa-labs/crossbow/3027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link
{code}
=================================== FAILURES ===================================
___________________ test_timeseries_nulls_in_schema[pyarrow] ___________________
tmpdir = local('/tmp/pytest-of-root/pytest-0/test_timeseries_nulls_in_schem0')
engine = 'pyarrow'
def test_timeseries_nulls_in_schema(tmpdir, engine):
tmp_path = str(tmpdir)
ddf2 = (
dask.datasets.timeseries(start="2000-01-01", end="2000-01-03", freq="1h")
.reset_index()
.map_partitions(lambda x: x.loc[:5])
)
ddf2 = ddf2.set_index("x").reset_index().persist()
ddf2.name = ddf2.name.where(ddf2.timestamp == "2000-01-01", None)
ddf2.to_parquet(tmp_path, engine=engine)
ddf_read = dd.read_parquet(tmp_path, engine=engine)
assert_eq(ddf_read, ddf2, check_divisions=False, check_index=False)
# Can force schema validation on each partition in pyarrow
if engine == "pyarrow":
# The schema mismatch should raise an error
with pytest.raises(ValueError):
ddf_read = dd.read_parquet(
tmp_path, dataset={"validate_schema": True}, engine=engine
)
# There should be no error if you specify a schema on write
schema = pa.schema(
[
("x", pa.float64()),
("timestamp", pa.timestamp("ns")),
("id", pa.int64()),
("name", pa.string()),
("y", pa.float64()),
]
)
ddf2.to_parquet(tmp_path, schema=schema, engine=engine)
assert_eq(
> dd.read_parquet(tmp_path, dataset={"validate_schema": True}, engine=engine),
ddf2,
check_divisions=False,
check_index=False,
)
opt/conda/lib/python3.6/site-packages/dask/dataframe/io/tests/test_parquet.py:1964:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:190: in read_parquet
out = sorted_columns(statistics)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
statistics = ({'columns': [{'max': -0.25838390663957256, 'min': -0.979681447427093, 'name': 'x', 'null_count': 0}, {'max': Timestam...ull_count': 0}, {'max': 0.8978352477516438, 'min': -0.7218571212693894, 'name': 'y', 'null_count': 0}], 'num-rows': 7})
def sorted_columns(statistics):
""" Find sorted columns given row-group statistics
This finds all columns that are sorted, along with appropriate divisions
values for those columns
Returns
-------
out: List of {'name': str, 'divisions': List[str]} dictionaries
"""
if not statistics:
return []
out = []
for i, c in enumerate(statistics[0]["columns"]):
if not all(
"min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
):
continue
divisions = [c["min"]]
max = c["max"]
success = True
for stats in statistics[1:]:
c = stats["columns"][i]
> if c["min"] >= max:
E TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'
opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:570: TypeError
{code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)