Posted to jira@arrow.apache.org by "Wes McKinney (Jira)" <ji...@apache.org> on 2020/11/05 16:51:00 UTC
[jira] [Updated] (ARROW-10501) [C++][Python] Behavior of
parquet.read_table with filter and parquets containing null
[ https://issues.apache.org/jira/browse/ARROW-10501?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wes McKinney updated ARROW-10501:
---------------------------------
Summary: [C++][Python] Behavior of parquet.read_table with filter and parquets containing null (was: Behavior of parquet.read_table with filter and parquets containing null)
> [C++][Python] Behavior of parquet.read_table with filter and parquets containing null
> -------------------------------------------------------------------------------------
>
> Key: ARROW-10501
> URL: https://issues.apache.org/jira/browse/ARROW-10501
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++, Python
> Affects Versions: 1.0.1, 2.0.0
> Reporter: Masaaki Hamada
> Priority: Major
> Attachments: read_table_regression.zip
>
>
> Hi,
> I investigated what parquet.read_table returns when a filter is applied and
> found some strange behaviors.
> Please see the following source code to reproduce them.
> Details are written in the comments.
> {code:python}
> import pandas as pd
> import pyarrow.parquet as pq
>
>
> def test_filter_with_null_contained_parquets():
>     def check(path, filters, expected_df):
>         params = {
>             'columns': ['field'],
>             'filters': filters,
>         }
>         tb = pq.read_table(path, **params)
>         df = tb.to_pandas()
>         ret = df.equals(expected_df)
>         return ret
>
>     # see below for how to make these parquets
>     dir_name = './read_table_regression/'
>     pq_an = dir_name + 'all_null.snappy.parquet'
>     pq_sn = dir_name + 'some_null.snappy.parquet'
>     pq_hn = dir_name + 'half_null.snappy.parquet'
>     pq_es = dir_name + 'empty_string.snappy.parquet'
>
>     # the actual DataFrames read_table returns (all asserts below pass)
>     empty_df = pd.DataFrame(columns=['field'])
>     one_null_df = pd.DataFrame({'field': [None]})
>     non_null_df = pd.DataFrame({'field': ['123']})
>     es_contained_df = pd.DataFrame({'field': ['123', '']})
>     es_removed_df = pd.DataFrame({'field': ['123']})
>
>     #
>     # case 1: 'not equals' and empty string
>     #
>     f0 = [('field', '!=', '')]
>     # why are the nulls removed?
>     assert check(pq_an, f0, empty_df)         # [null] -> []
>     assert check(pq_sn, f0, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f0, es_removed_df)    # [null, '123', ''] -> ['123']
>
>     #
>     # case 2: 'not equals' and null
>     #
>     f1 = [('field', '!=', None)]
>     # ok.
>     assert check(pq_an, f1, empty_df)         # [null] -> []
>     # why empty?
>     assert check(pq_sn, f1, empty_df)         # [null, null, '123'] -> []
>     assert check(pq_es, f1, empty_df)         # [null, '123', ''] -> []
>
>     #
>     # case 3: 'not in' and empty string
>     #
>     f2 = [('field', 'not in', [''])]
>     f3 = [('field', 'not in', ['abc'])]
>     # the results seem inconsistent:
>     # null remains.
>     assert check(pq_an, f2, one_null_df)      # [null] -> [null]
>     assert check(pq_an, f3, one_null_df)      # [null] -> [null]
>     # null is removed.
>     assert check(pq_sn, f2, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f2, es_removed_df)    # [null, '123', ''] -> ['123']
>     assert check(pq_sn, f3, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f3, es_contained_df)  # [null, '123', ''] -> ['123', '']
>
>     #
>     # case 4: 'not in' and null
>     #
>     f4 = [('field', 'not in', [None])]
>     # seems to be no problem
>     assert check(pq_an, f4, empty_df)         # [null] -> []
>     assert check(pq_sn, f4, non_null_df)      # [null, null, '123'] -> ['123']
>     assert check(pq_es, f4, es_contained_df)  # [null, '123', ''] -> ['123', '']
>
>     #
>     # case 5: half the data is null
>     #
>     # These results are obviously wrong.
>     # It seems this only happens with a parquet that has statistics metadata
>     # and in which exactly half the data is null.
>     #
>     # I have already looked into the C++ layer myself to find the root cause:
>     #
>     # https://github.com/apache/arrow/blob/d4121d8a17d9e53ad4421960e357dd2f89771603/cpp/src/arrow/dataset/file_parquet.cc#L150
>     # > // Optimize for corner case where all values are nulls
>     # > if (statistics->num_values() == statistics->null_count()) {
>     #
>     # This comparison looks wrong because num_values() returns the non-null count.
>     assert check(pq_hn, f0, empty_df)         # [null, '123'] -> []
>     assert check(pq_hn, f1, empty_df)         # [null, '123'] -> []
>     assert check(pq_hn, f2, non_null_df)      # [null, '123'] -> ['123']
>     assert check(pq_hn, f3, non_null_df)      # [null, '123'] -> ['123']
>     assert check(pq_hn, f4, empty_df)         # [null, '123'] -> []
> {code}
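>
> For what it's worth, my guess (I have not confirmed this in the code) is that the '!=' behavior in cases 1 and 2 comes from SQL-style three-valued logic: comparing null with anything yields null, and the filter then drops those rows. The following sketch reproduces the same effect with pyarrow's compute kernels; I am assuming pc.not_equal and Array.filter behave here as they do in my pyarrow 2.0.0 environment.
> {code:python}
> import pyarrow as pa
> import pyarrow.compute as pc
>
> # Comparison kernels propagate nulls (three-valued logic): a null
> # input yields a null in the boolean mask, not true or false.
> arr = pa.array([None, '123', ''], type=pa.string())
> mask = pc.not_equal(arr, '')  # [null, true, false]
>
> # Array.filter drops rows with a null mask entry by default, which
> # would explain why the nulls disappear under the '!=' filters above.
> print(arr.filter(mask))  # -> ['123']
> {code}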
> The code which makes the parquets above is as follows.
> I also attached the parquets I made, just in case.
> [^read_table_regression.zip]
> {code:python}
> import os
>
> import pandas as pd
> import pyarrow as pa
> import pyarrow.parquet as pq
>
>
> def write_table(table, path):
>     pq.write_table(table, path, compression='snappy', use_dictionary=True)
>
>
> def main():
>     dir_name = 'read_table_regression/'
>     os.makedirs(dir_name, exist_ok=True)
>
>     schema = pa.schema([('field', pa.string())])
>
>     df = pd.DataFrame({'field': [None]})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/all_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, None, '123']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/some_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, '123']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/half_null.snappy.parquet')
>
>     df = pd.DataFrame({'field': [None, '123', '']})
>     table = pa.Table.from_pandas(df, schema=schema)
>     write_table(table, f'{dir_name}/empty_string.snappy.parquet')
>
>
> if __name__ == "__main__":
>     main()
> {code}
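>
> To double-check the num_values() point, here is a small sketch (assuming the files generated by the script above exist) that prints the row group statistics of the half-null parquet:
> {code:python}
> import pyarrow.parquet as pq
>
> # Inspect the column chunk statistics of the half-null file.
> md = pq.ParquetFile('read_table_regression/half_null.snappy.parquet').metadata
> stats = md.row_group(0).column(0).statistics
>
> # num_values counts only the non-null values, so with one null and
> # one '123' both counts are 1, and the quoted C++ condition
> # (num_values() == null_count()) wrongly takes the all-nulls branch.
> print(stats.num_values, stats.null_count)  # -> 1 1
> {code}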
--
This message was sent by Atlassian Jira
(v8.3.4#803005)