Posted to issues@arrow.apache.org by "Wes McKinney (Jira)" <ji...@apache.org> on 2019/09/19 19:12:00 UTC

[jira] [Created] (ARROW-6623) [CI][Python] Dask docker integration test broken perhaps by statistics-related change

Wes McKinney created ARROW-6623:
-----------------------------------

             Summary: [CI][Python] Dask docker integration test broken perhaps by statistics-related change
                 Key: ARROW-6623
                 URL: https://issues.apache.org/jira/browse/ARROW-6623
             Project: Apache Arrow
          Issue Type: Bug
          Components: Python
            Reporter: Wes McKinney
             Fix For: 0.15.0


See the new failure:

https://circleci.com/gh/ursa-labs/crossbow/3027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link

{code}
=================================== FAILURES ===================================
___________________ test_timeseries_nulls_in_schema[pyarrow] ___________________

tmpdir = local('/tmp/pytest-of-root/pytest-0/test_timeseries_nulls_in_schem0')
engine = 'pyarrow'

    def test_timeseries_nulls_in_schema(tmpdir, engine):
        tmp_path = str(tmpdir)
        ddf2 = (
            dask.datasets.timeseries(start="2000-01-01", end="2000-01-03", freq="1h")
            .reset_index()
            .map_partitions(lambda x: x.loc[:5])
        )
        ddf2 = ddf2.set_index("x").reset_index().persist()
        ddf2.name = ddf2.name.where(ddf2.timestamp == "2000-01-01", None)
    
        ddf2.to_parquet(tmp_path, engine=engine)
        ddf_read = dd.read_parquet(tmp_path, engine=engine)
    
        assert_eq(ddf_read, ddf2, check_divisions=False, check_index=False)
    
        # Can force schema validation on each partition in pyarrow
        if engine == "pyarrow":
            # The schema mismatch should raise an error
            with pytest.raises(ValueError):
                ddf_read = dd.read_parquet(
                    tmp_path, dataset={"validate_schema": True}, engine=engine
                )
            # There should be no error if you specify a schema on write
            schema = pa.schema(
                [
                    ("x", pa.float64()),
                    ("timestamp", pa.timestamp("ns")),
                    ("id", pa.int64()),
                    ("name", pa.string()),
                    ("y", pa.float64()),
                ]
            )
            ddf2.to_parquet(tmp_path, schema=schema, engine=engine)
            assert_eq(
>               dd.read_parquet(tmp_path, dataset={"validate_schema": True}, engine=engine),
                ddf2,
                check_divisions=False,
                check_index=False,
            )

opt/conda/lib/python3.6/site-packages/dask/dataframe/io/tests/test_parquet.py:1964: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:190: in read_parquet
    out = sorted_columns(statistics)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

statistics = ({'columns': [{'max': -0.25838390663957256, 'min': -0.979681447427093, 'name': 'x', 'null_count': 0}, {'max': Timestam...ull_count': 0}, {'max': 0.8978352477516438, 'min': -0.7218571212693894, 'name': 'y', 'null_count': 0}], 'num-rows': 7})

    def sorted_columns(statistics):
        """ Find sorted columns given row-group statistics
    
        This finds all columns that are sorted, along with appropriate divisions
        values for those columns
    
        Returns
        -------
        out: List of {'name': str, 'divisions': List[str]} dictionaries
        """
        if not statistics:
            return []
    
        out = []
        for i, c in enumerate(statistics[0]["columns"]):
            if not all(
                "min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
            ):
                continue
            divisions = [c["min"]]
            max = c["max"]
            success = True
            for stats in statistics[1:]:
                c = stats["columns"][i]
>               if c["min"] >= max:
E               TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'

opt/conda/lib/python3.6/site-packages/dask/dataframe/io/parquet/core.py:570: TypeError
{code}
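
The TypeError at the bottom can be reproduced in isolation. A minimal sketch follows; the assumption (not a confirmed root cause) is that the statistics-related change leaves one row group's min/max as a numpy array while another row group's statistic is still a plain Python value, which matches the ndarray-vs-str comparison in the traceback. The variable names are illustrative, not from the Dask/pyarrow code path:

{code}
import numpy as np

# Hypothetical stand-ins for row-group statistics: one surfaced as an
# ndarray, the other as a plain str.
min_from_next_row_group = np.array([-0.97, -0.25])  # ndarray-valued statistic
max_so_far = "2000-01-01"                           # str-valued statistic

# Ordering comparisons between a numeric ndarray and a str return
# NotImplemented on both sides, so Python raises the same error seen
# in sorted_columns():
min_from_next_row_group >= max_so_far
# TypeError: '>=' not supported between instances of 'numpy.ndarray' and 'str'
{code}

If that reading is right, the fix probably belongs on the statistics side (returning scalar min/max values consistently across row groups) rather than in sorted_columns() itself.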


