Posted to jira@arrow.apache.org by "Wes McKinney (Jira)" <ji...@apache.org> on 2020/11/05 16:51:00 UTC
[jira] [Updated] (ARROW-10501) [C++][Python] Behavior of
parquet.read_table with filter and parquets containing null
[ https://issues.apache.org/jira/browse/ARROW-10501?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wes McKinney updated ARROW-10501:
---------------------------------
Summary: [C++][Python] Behavior of parquet.read_table with filter and parquets containing null (was: Behavior of parquet.read_table with filter and parquets containing null)
> [C++][Python] Behavior of parquet.read_table with filter and parquets containing null
> -------------------------------------------------------------------------------------
>
> Key: ARROW-10501
> URL: https://issues.apache.org/jira/browse/ARROW-10501
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++, Python
> Affects Versions: 1.0.1, 2.0.0
> Reporter: Masaaki Hamada
> Priority: Major
> Attachments: read_table_regression.zip
>
>
> Hi,
> I investigated what parquet.read_table returns when a filter is applied and
> found some strange behaviors.
> Please see the following source code to reproduce them.
> Details are written in the comments.
> {code:python}
> import pandas as pd
> import pyarrow.parquet as pq
>
>
> def test_filter_with_null_contained_parquets():
>     def check(path, filters, expected_df):
>         params = {
>             'columns': ['field'],
>             'filters': filters,
>         }
>         tb = pq.read_table(path, **params)
>         df = tb.to_pandas()
>         ret = df.equals(expected_df)
>         return ret
>
>     # see below for how to make these parquets
>     dir_name = './read_table_regression/'
>     pq_an = dir_name + 'all_null.snappy.parquet'
>     pq_sn = dir_name + 'some_null.snappy.parquet'
>     pq_hn = dir_name + 'half_null.snappy.parquet'
>     pq_es = dir_name + 'empty_string.snappy.parquet'
>
>     # the actual DataFrames read_table returns (all asserts below pass)
>     empty_df = pd.DataFrame(columns=['field'])
>     one_null_df = pd.DataFrame({'field': [None]})
>     non_null_df = pd.DataFrame({'field': ['123']})
>     es_contained_df = pd.DataFrame({'field': ['123', '']})
>     es_removed_df = pd.DataFrame({'field': ['123']})
>
>     #
>     # case 1: 'not equals' and empty string
>     #
>     f0 = [('field', '!=', '')]
>     # why are the nulls removed?
>     assert check(pq_an, f0, empty_df)         # [null] -> []
>     assert check(pq_sn, f0, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f0, es_removed_df)    # [null, '123', ''] -> ['123']
>
>     #
>     # case 2: 'not equals' and null
>     #
>     f1 = [('field', '!=', None)]
>     # ok.
>     assert check(pq_an, f1, empty_df)         # [null] -> []
>     # why empty?
>     assert check(pq_sn, f1, empty_df)         # [null, null, '123'] -> []
>     assert check(pq_es, f1, empty_df)         # [null, '123', ''] -> []
>
>     #
>     # case 3: 'not in' and empty string
>     #
>     f2 = [('field', 'not in', [''])]
>     f3 = [('field', 'not in', ['abc'])]
>     # the results seem inconsistent:
>     # null remains.
>     assert check(pq_an, f2, one_null_df)      # [null] -> [null]
>     assert check(pq_an, f3, one_null_df)      # [null] -> [null]
>     # null is removed.
>     assert check(pq_sn, f2, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f2, es_removed_df)    # [null, '123', ''] -> ['123']
>     assert check(pq_sn, f3, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f3, es_contained_df)  # [null, '123', ''] -> ['123', '']
>
>     #
>     # case 4: 'not in' and null
>     #
>     f4 = [('field', 'not in', [None])]
>     # seems to be no problem
>     assert check(pq_an, f4, empty_df)         # [null] -> []
>     assert check(pq_sn, f4, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f4, es_contained_df)  # [null, '123', ''] -> ['123', '']
>
>     #
>     # case 5: half the data is null
>     #
>     # These results are obviously wrong.
>     # It seems this only happens with a parquet that has statistics metadata
>     # and in which exactly half the data is null.
>     #
>     # I have already looked into the C++ layer myself to find the root cause:
>     #
>     # https://github.com/apache/arrow/blob/d4121d8a17d9e53ad4421960e357dd2f89771603/cpp/src/arrow/dataset/file_parquet.cc#L150
>     # > // Optimize for corner case where all values are nulls
>     # > if (statistics->num_values() == statistics->null_count()) {
>     #
>     # This comparison looks wrong because num_values() returns the non-null count.
>     assert check(pq_hn, f0, empty_df)         # [null, '123'] -> []
>     assert check(pq_hn, f1, empty_df)         # [null, '123'] -> []
>     assert check(pq_hn, f2, non_null_df)      # [null, '123'] -> ['123']
>     assert check(pq_hn, f3, non_null_df)      # [null, '123'] -> ['123']
>     assert check(pq_hn, f4, empty_df)         # [null, '123'] -> []
> {code}
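>
> For what it's worth, my guess (I have not confirmed this in the code) is that the '!=' behavior in cases 1 and 2 comes from SQL-style three-valued logic: comparing null with anything yields null, and the filter then drops those rows. The following sketch reproduces the same effect with pyarrow's compute kernels; I am assuming pc.not_equal and Array.filter behave here as they do in my pyarrow 2.0.0 environment.
> {code:python}
> import pyarrow as pa
> import pyarrow.compute as pc
>
> # Comparison kernels propagate nulls (three-valued logic): a null
> # input yields a null in the boolean mask, not true or false.
> arr = pa.array([None, '123', ''], type=pa.string())
> mask = pc.not_equal(arr, '')  # [null, true, false]
>
> # Array.filter drops rows with a null mask entry by default, which
> # would explain why the nulls disappear under the '!=' filters above.
> print(arr.filter(mask))  # -> ['123']
> {code}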
> The code which makes the parquets above is as follows.
> I also attached the parquets I made, just in case.
> [^read_table_regression.zip]
> {code:python}
> import os
>
> import pandas as pd
> import pyarrow as pa
> import pyarrow.parquet as pq
>
>
> def write_table(table, path):
>     pq.write_table(table, path, compression='snappy', use_dictionary=True)
>
>
> def main():
>     dir_name = 'read_table_regression/'
>     os.makedirs(dir_name, exist_ok=True)
>
>     schema = pa.schema([('field', pa.string())])
>
>     df = pd.DataFrame({'field': [None]})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/all_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, None, '123']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/some_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, '123']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/half_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, '123', '']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/empty_string.snappy.parquet')
>
>
> if __name__ == "__main__":
>     main()
> {code}
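>
> To double-check the num_values() point, here is a small sketch (assuming the files generated by the script above exist) that prints the row group statistics of the half-null parquet:
> {code:python}
> import pyarrow.parquet as pq
>
> # Inspect the column chunk statistics of the half-null file.
> md = pq.ParquetFile('read_table_regression/half_null.snappy.parquet').metadata
> stats = md.row_group(0).column(0).statistics
>
> # num_values counts only the non-null values, so with one null and
> # one '123' both counts are 1, and the quoted C++ condition
> # (num_values() == null_count()) wrongly takes the all-nulls branch.
> print(stats.num_values, stats.null_count)  # -> 1 1
> {code}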
--
This message was sent by Atlassian Jira
(v8.3.4#803005)