Posted to jira@arrow.apache.org by "Masaaki Hamada (Jira)" <ji...@apache.org> on 2020/11/05 12:48:00 UTC
[jira] [Created] (ARROW-10501) Behavior of parquet.read_table with filter and parquets containing null
Masaaki Hamada created ARROW-10501:
--------------------------------------
Summary: Behavior of parquet.read_table with filter and parquets containing null
Key: ARROW-10501
URL: https://issues.apache.org/jira/browse/ARROW-10501
Project: Apache Arrow
Issue Type: Bug
Components: C++, Python
Affects Versions: 2.0.0, 1.0.1
Reporter: Masaaki Hamada
Attachments: read_table_regression.zip
Hi,
I investigated what parquet.read_table with filter returns and found some strange behaviors.
Please see the following source code to reproduce them; the details are written as comments.
{code:python}
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os


def test_filter_with_null_contained_parquets():
    def check(path, filter, expected_df):
        params = {
            'columns': ['field'],
            'filters': filter,
        }
        tb = pq.read_table(path, **params)
        df = tb.to_pandas()
        ret = df.equals(expected_df)
        return ret

    # see below how to make these parquets
    dir_name = './read_table_regression/'
    pq_an = dir_name + 'all_null.snappy.parquet'
    pq_sn = dir_name + 'some_null.snappy.parquet'
    pq_hn = dir_name + 'half_null.snappy.parquet'
    pq_es = dir_name + 'empty_string.snappy.parquet'

    # actual DataFrames returned by read_table
    empty_df = pd.DataFrame(columns=['field'])
    one_null_df = pd.DataFrame({'field': [None]})
    non_null_df = pd.DataFrame({'field': ['123']})
    es_contained_df = pd.DataFrame({'field': ['123', '']})
    es_removed_df = pd.DataFrame({'field': ['123']})

    #
    # case 1: 'not equals' and empty string
    #
    f0 = [('field', '!=', '')]
    # why are nulls removed?
    assert check(pq_an, f0, empty_df)         # [null] -> []
    assert check(pq_sn, f0, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f0, es_removed_df)    # [null, '123', ''] -> ['123']

    #
    # case 2: 'not equals' and null
    #
    f1 = [('field', '!=', None)]
    # ok.
    assert check(pq_an, f1, empty_df)         # [null] -> []
    # why empty?
    assert check(pq_sn, f1, empty_df)         # [null, null, '123'] -> []
    assert check(pq_es, f1, empty_df)         # [null, '123', ''] -> []

    #
    # case 3: 'not in' and empty string
    #
    f2 = [('field', 'not in', [''])]
    f3 = [('field', 'not in', ['abc'])]
    # the results seem inconsistent:
    # null remains.
    assert check(pq_an, f2, one_null_df)      # [null] -> [null]
    assert check(pq_an, f3, one_null_df)      # [null] -> [null]
    # null is removed.
    assert check(pq_sn, f2, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f2, es_removed_df)    # [null, '123', ''] -> ['123']
    assert check(pq_sn, f3, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f3, es_contained_df)  # [null, '123', ''] -> ['123', '']

    #
    # case 4: 'not in' and null
    #
    f4 = [('field', 'not in', [None])]
    # seems to be no problem
    assert check(pq_an, f4, empty_df)         # [null] -> []
    assert check(pq_sn, f4, non_null_df)      # [null, null, '123'] -> ['123']
    assert check(pq_es, f4, es_contained_df)  # [null, '123', ''] -> ['123', '']

    #
    # case 5: half the data are null
    #
    # Obviously, these are wrong results.
    # It seems this only happens with a parquet that has statistics metadata and
    # in which exactly half of the data are null.
    #
    # Actually, I have already looked into the C++ layer myself to find a root cause:
    #
    # https://github.com/apache/arrow/blob/d4121d8a17d9e53ad4421960e357dd2f89771603/cpp/src/arrow/dataset/file_parquet.cc#L150
    # > // Optimize for corner case where all values are nulls
    # > if (statistics->num_values() == statistics->null_count()) {
    #
    # This comparison looks wrong because num_values() returns the non-null count.
    assert check(pq_hn, f0, empty_df)         # [null, '123'] -> []
    assert check(pq_hn, f1, empty_df)         # [null, '123'] -> []
    assert check(pq_hn, f2, non_null_df)      # [null, '123'] -> ['123']
    assert check(pq_hn, f3, non_null_df)      # [null, '123'] -> ['123']
    assert check(pq_hn, f4, empty_df)         # [null, '123'] -> []
{code}
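For what it's worth, the row-group statistics of the half-null file can also be inspected from the Python side. The following is just a minimal sketch (it assumes pyarrow's ParquetFile.metadata / Statistics API and the half_null.snappy.parquet produced by the generator script below). If I read the semantics correctly, Statistics.num_values counts only the non-null values, so for [null, '123'] both numbers should be 1 and the "all values are nulls" shortcut in file_parquet.cc fires even though the row group is not all null.
{code:python}
import pyarrow.parquet as pq

# Row-group statistics of the half-null parquet written by the generator
# script below. For [null, '123'] I would expect num_values (non-null
# values only) and null_count to both be 1, which makes the
# "num_values() == null_count()" check in file_parquet.cc treat this
# row group as if it were all nulls.
meta = pq.ParquetFile('./read_table_regression/half_null.snappy.parquet').metadata
stats = meta.row_group(0).column(0).statistics
print(stats.num_values)  # expected: 1 (non-null values only)
print(stats.null_count)  # expected: 1
{code}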
The code that makes the parquets above is as follows. I have also attached the parquets I made, just in case.
{code:python}
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

def write_table(table, path):
    pq.write_table(table, path, compression='snappy', use_dictionary=True)

def main():
    dir_name = 'read_table_regression/'
    os.makedirs(dir_name, exist_ok=True)

    schema = pa.schema([('field', pa.string())])

    # all_null: every value is null
    df = pd.DataFrame({'field': [None]})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/all_null.snappy.parquet')

    # some_null: two nulls and one non-null value
    df = pd.DataFrame({'field': [None, None, '123']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/some_null.snappy.parquet')

    # half_null: exactly half of the values are null
    df = pd.DataFrame({'field': [None, '123']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/half_null.snappy.parquet')

    # empty_string: a null, a non-null value and an empty string
    df = pd.DataFrame({'field': [None, '123', '']})
    table = pa.Table.from_pandas(df, schema=schema)
    write_table(table, f'{dir_name}/empty_string.snappy.parquet')

if __name__ == "__main__":
    main()
{code}