You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Raúl Cumplido (Jira)" <ji...@apache.org> on 2022/06/30 11:26:00 UTC

[jira] [Comment Edited] (ARROW-15977) [Python] Can't ignore the overflow error.

    [ https://issues.apache.org/jira/browse/ARROW-15977?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17560993#comment-17560993 ] 

Raúl Cumplido edited comment on ARROW-15977 at 6/30/22 11:25 AM:
-----------------------------------------------------------------

I have tried to investigate the issue but I'll need a lot of time to understand our numpy_to_arrow.cc workflow. I am just adding a small diff here with some minor unit tests to reproduce the issue in case it helps someone:
{code:java}
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 215bf2f..26108f5 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -697,6 +697,29 @@ class TestConvertPrimitiveTypes:
         assert table[0].to_pylist() == [1, 2, None]
         tm.assert_frame_equal(df, table.to_pandas())
 
+    def test_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": 3046682132}])
+        schema = pa.schema([pa.field("a", pa.int32(), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        assert table[0].to_pylist() == [-1248285164]
+
+    def test_list_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": [1, 3046682132, 0]}])
+        schema = pa.schema([pa.field("a", pa.list_(pa.int32()), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        # TODO Fix assert
+        assert table[0].to_pylist() == [-1248285164]
+
+    def test_struct_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": {"b": 3046682132}}])
+        schema = pa.schema([pa.field("a", pa.struct([pa.field("b", pa.int32())]), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        # TODO Fix assert
+        assert table[0].to_pylist() == [-1248285164]
+
{code}


was (Author: JIRAUSER287560):
I have tried to investigate the issue but I'll need a lot of time to understand our numpy_to_arrow.cc workflow. I am just adding a small diff here with some minor unit tests to reproduce the issue in case it helps someone:
{code:java}
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 215bf2f..26108f5 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -697,6 +697,29 @@ class TestConvertPrimitiveTypes:
         assert table[0].to_pylist() == [1, 2, None]
         tm.assert_frame_equal(df, table.to_pandas())
 
+    def test_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": 3046682132}])
+        schema = pa.schema([pa.field("a", pa.int32(), nullable=True)])
+        import pdb; pdb.set_trace()
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        assert table[0].to_pylist() == [-1248285164]
+
+    def test_list_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": [1, 3046682132, 0]}])
+        schema = pa.schema([pa.field("a", pa.list_(pa.int32()), nullable=True)])
+        import pdb; pdb.set_trace()
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        # TODO Fix assert
+        assert table[0].to_pylist() == [-1248285164]
+
+    def test_struct_int_overflow(self):
+        # ARROW-15977
+        df = pd.DataFrame([{"a": {"b": 3046682132}}])
+        schema = pa.schema([pa.field("a", pa.struct([pa.field("b", pa.int32())]), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema, safe=False)
+        # TODO Fix assert
+        assert table[0].to_pylist() == [-1248285164]
+
{code}

> [Python] Can't ignore the overflow error.
> -----------------------------------------
>
>                 Key: ARROW-15977
>                 URL: https://issues.apache.org/jira/browse/ARROW-15977
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 7.0.0
>            Reporter: taichi kato
>            Priority: Critical
>
> I know that the argument safe=False in pa.Table.from_pandas ignores overflow errors, but it does not ignore overflows that occur inside a list or a struct column.
> The following works:
> {code:java}
> import pyarrow as pa
> import pyarrow.parquet as pq
> import pandas as pd
> import json
> test_json = [
>     {
>         "name": "taro",
>         "id": 3046682132,
>         "points": [2, 2, 2],
>         "groups": {
>             "group_name": "baseball", 
>             "group_id": 1234
>         }
>     },
>     { 
>         "name": "taro",
>         "id": 1234, 
>     }
> ]
> schema = pa.schema([
>     pa.field('name', pa.string()),
>     pa.field('id', pa.int32()),
>     pa.field("points", pa.list_(pa.int32())),
>     pa.field('groups', pa.struct([
>         pa.field("group_name", pa.string()),
>         pa.field("group_id", pa.int32()),
>     ])),
> ])
> writer = pq.ParquetWriter('test_schema.parquet', schema=schema)
> df = pd.DataFrame(test_json)
> table = pa.Table.from_pandas(df, schema=schema, safe=False)
> writer.write_table(table)
> writer.close()
> table = pq.read_table("test_schema.parquet")
> print(table) {code}
> {code:java}
> name: [["taro","taro"]]
> id: [[-1248285164,1234]]
> points: [[[2,2,2],null]]
> groups: [
>   -- is_valid: [ true, false ]
>   -- child 0 type: string [ "baseball", null ]
>   -- child 1 type: int32 [ 1234, null ]
> ]
> {code}
> However, the following two do not work.
>  
> {code:java}
> test_json = [
>     {
>         "name": "taro",
>         "id": 2,
>         "points": [2, 3046682132, 2],
>         "groups": {
>             "group_name": "baseball", 
>             "group_id": 1234
>         }
>     },
>     { 
>         "name": "taro",
>         "id": 1234, 
>     }
> ]{code}
> {code:java}
> Traceback (most recent call last):
> File "test_pyarrow.py", line 35, in <module>
> table = pa.Table.from_pandas(df, schema=schema, safe=False)
> File "pyarrow/table.pxi", line 1782, in pyarrow.lib.Table.from_pandas
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 594, in dataframe_to_arrays
> arrays = [convert_column(c, f)
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 594, in <listcomp>
> arrays = [convert_column(c, f)
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 581, in convert_column
> raise e
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 575, in convert_column
> result = pa.array(col, type=type_, from_pandas=True, safe=safe)
> File "pyarrow/array.pxi", line 312, in pyarrow.lib.array
> File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
> File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
> pyarrow.lib.ArrowInvalid: ('Value 3046682132 too large to fit in C integer type', 'Conversion failed for column points with type object') {code}
> {code:java}
> test_json = [
>     {
>         "name": "taro",
>         "id": 2,
>         "points": [2, 2, 2],
>         "groups": {
>             "group_name": "baseball", 
>             "group_id": 3046682132
>         }
>     },
>     { 
>         "name": "taro",
>         "id": 1234, 
>     }
> ] {code}
> {code:java}
> Traceback (most recent call last):
> File "test_pyarrow.py", line 35, in <module>
> table = pa.Table.from_pandas(df, schema=schema, safe=False)
> File "pyarrow/table.pxi", line 1782, in pyarrow.lib.Table.from_pandas
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 594, in dataframe_to_arrays
> arrays = [convert_column(c, f)
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 594, in <listcomp>
> arrays = [convert_column(c, f)
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 581, in convert_column
> raise e
> File "/home/s0108403058/.pyenv/versions/3.8.0/lib/python3.8/site-packages/pyarrow/pandas_compat.py", line 575, in convert_column
> result = pa.array(col, type=type_, from_pandas=True, safe=safe)
> File "pyarrow/array.pxi", line 312, in pyarrow.lib.array
> File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
> File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
> pyarrow.lib.ArrowInvalid: ('Value 3046682132 too large to fit in C integer type', 'Conversion failed for column groups with type object') {code}
> Could you please fix this bug?
> pyarrow==7.0.0



--
This message was sent by Atlassian Jira
(v8.20.10#820010)