Posted to jira@arrow.apache.org by "Masaaki Hamada (Jira)" <ji...@apache.org> on 2020/11/05 12:48:00 UTC

[jira] [Created] (ARROW-10501) Behavior of parquet.read_table with filter and parquets containing null

Masaaki Hamada created ARROW-10501:
--------------------------------------

             Summary: Behavior of parquet.read_table with filter and parquets containing null
                 Key: ARROW-10501
                 URL: https://issues.apache.org/jira/browse/ARROW-10501
             Project: Apache Arrow
          Issue Type: Bug
          Components: C++, Python
    Affects Versions: 2.0.0, 1.0.1
            Reporter: Masaaki Hamada
         Attachments: read_table_regression.zip

Hi,

I investigated what parquet.read_table returns when called with filters and
found some strange behaviors.

Please see the following source code to reproduce them.
Details are given in the comments.
{code:java}
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


def test_filter_with_null_contained_parquets():
    def check(path, filter, expected_df):
        params = {
            'columns': ['field'],
            'filters': filter,
        }
        tb = pq.read_table(path, **params)
        df = tb.to_pandas()
        ret = df.equals(expected_df)
        return ret

    # see below how to make these parquets
    dir_name = './read_table_regression/'
    pq_an = dir_name + 'all_null.snappy.parquet'
    pq_sn = dir_name + 'some_null.snappy.parquet'
    pq_hn = dir_name + 'half_null.snappy.parquet'
    pq_es = dir_name + 'empty_string.snappy.parquet'

    # actual DataFrames from read_table
    empty_df = pd.DataFrame(columns=['field'])
    one_null_df = pd.DataFrame({'field': [None]})
    non_null_df = pd.DataFrame({'field': ['123']})
    es_contained_df = pd.DataFrame({'field': ['123', '']})
    es_removed_df = pd.DataFrame({'field': ['123']})
    #
    # case 1: 'not equals' and empty string
    #
    f0 = [('field', '!=', '')]
    # why are the nulls removed?
    assert check(pq_an, f0, empty_df)       # [null]              -> []
    assert check(pq_sn, f0, non_null_df)    # [null, null, '123'] -> ['123']
    assert check(pq_es, f0, es_removed_df)  # [null, '123', '']   -> ['123']
    #
    # case 2: 'not equals' and null
    #
    f1 = [('field', '!=', None)]
    # ok.
    assert check(pq_an, f1, empty_df)     # [null]              -> []
    # why empty?
    assert check(pq_sn, f1, empty_df)     # [null, null, '123'] -> []
    assert check(pq_es, f1, empty_df)     # [null, '123', '']   -> []
    #
    # case 3: 'not in' and empty string
    #
    f2 = [('field', 'not in', [''])]
    f3 = [('field', 'not in', ['abc'])]
    # the results seem inconsistent
    # null remains.
    assert check(pq_an, f2, one_null_df)      # [null]              -> [null]
    assert check(pq_an, f3, one_null_df)      # [null]              -> [null]
    # null removed.
    assert check(pq_sn, f2, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f2, es_removed_df)    # [null, '123', '']   -> ['123']
    assert check(pq_sn, f3, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f3, es_contained_df)  # [null, '123', '']   -> ['123', '']
    #
    # case 4: 'not in' and null
    #
    f4 = [('field', 'not in', [None])]
    # these results look fine
    assert check(pq_an, f4, empty_df)         # [null]              -> []
    assert check(pq_sn, f4, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f4, es_contained_df)  # [null, '123', '']   -> ['123', '']
    #
    # case 5: half the data are null
    #
    # Obviously, these results are wrong.
    # It seems this only happens with a parquet file that has statistics
    # metadata and where exactly half of the values are null.
    #
    # Actually, I have already looked into the C++ layer myself to find the root cause.
    #
    #   https://github.com/apache/arrow/blob/d4121d8a17d9e53ad4421960e357dd2f89771603/cpp/src/arrow/dataset/file_parquet.cc#L150
    #   > // Optimize for corner case where all values are nulls
    #   > if (statistics->num_values() == statistics->null_count()) {
    #
    # This comparison looks wrong because num_values() returns the non-null count.
    assert check(pq_hn, f0, empty_df)     # [null, '123'] -> []
    assert check(pq_hn, f1, empty_df)     # [null, '123'] -> []
    assert check(pq_hn, f2, non_null_df)  # [null, '123'] -> ['123']
    assert check(pq_hn, f3, non_null_df)  # [null, '123'] -> ['123']
    assert check(pq_hn, f4, empty_df)     # [null, '123'] -> []

{code}
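To double-check the num_values() point, the row-group statistics of the half-null file can be inspected directly from Python. A minimal sketch (it assumes the files generated by the script below; statistics can be None for files written without them):
{code:java}
import pyarrow.parquet as pq

# Inspect the column-chunk statistics of the half-null file.
# num_values counts only the non-null values, so for [null, '123']
# both num_values and null_count come out as 1 and the quoted
# "all values are nulls" corner case fires even though it should not.
meta = pq.ParquetFile('./read_table_regression/half_null.snappy.parquet').metadata
stats = meta.row_group(0).column(0).statistics
if stats is not None:
    print('num_values:', stats.num_values)  # 1 (non-null values only)
    print('null_count:', stats.null_count)  # 1
{code}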
The code that creates the parquet files above is as follows. I have also attached the generated files, just in case.
{code:java}
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


def write_table(table, path):
    pq.write_table(table, path, compression='snappy', use_dictionary=True)


def main():
    dir_name = 'read_table_regression'
    os.makedirs(dir_name, exist_ok=True)

    schema = pa.schema([('field', pa.string())])

    df = pd.DataFrame({'field': [None]})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/all_null.snappy.parquet')

    df = pd.DataFrame({'field': [None, None, '123']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/some_null.snappy.parquet')

    df = pd.DataFrame({'field': [None, '123']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/half_null.snappy.parquet')

    df = pd.DataFrame({'field': [None, '123', '']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/empty_string.snappy.parquet')


if __name__ == "__main__":
    main()

{code}
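For reference, a temporary workaround is to skip the pushdown filters entirely and apply the predicate in memory with explicit null handling. A rough sketch, just as an illustration, using the half_null file from above:
{code:java}
import pyarrow.parquet as pq

# Workaround sketch: read WITHOUT the filters argument, then apply the
# predicate in pandas with explicit null handling.
df = pq.read_table('./read_table_regression/half_null.snappy.parquet',
                   columns=['field']).to_pandas()
# keep rows where field is non-null and field != ''
df = df[df['field'].notna() & (df['field'] != '')]
print(df)  # expected: just the '123' row
{code}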


