You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "ARF (Jira)" <ji...@apache.org> on 2021/02/26 13:21:00 UTC

[jira] [Updated] (ARROW-11678) [python] Warn users if ParquetWriter coerces an explicitly defined schema

     [ https://issues.apache.org/jira/browse/ARROW-11678?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

ARF updated ARROW-11678:
------------------------
    Description: 
pyarrow should warn users if it coerces an explicitly defined schema. Not warning the user leads to confusing behaviour, as illustrated below:

 

{color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pa{color}
 {color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000}.{color}{color:#267f99}parquet{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pq{color}

{color:#001080}schema{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.schema({{color}
 {color:#000000}    {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.uint32(),{color}
 })

{color:#af00db}with{color}{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
 {color:#000000}    {color}{color:#001080}code{color}{color:#000000} = {color}{color:#098658}111000{color}
 {color:#000000}    {color}{color:#001080}table{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.Table.from_pydict({{color}
 {color:#000000}        {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.nulls({color}{color:#098658}10{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}.field({color}{color:#a31515}'code'{color}{color:#000000}).type).fill_null({color}{color:#001080}code{color}{color:#000000}),{color}
 {color:#000000}    }){color}
 {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}table{color}{color:#000000}){color}

{color:#001080}existing_table{color}{color:#000000} = {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#795e26}read_table{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}){color}

{color:#af00db}with{color}{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
 {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}existing_table{color}{color:#000000}){color}

 

 
----
*Error Message:*

ValueError: Table schema does not match schema used to create file:
 table:
 code: int64
 -- field metadata --
 PARQUET:field_id: '1' vs.
 file:
 code: uint32

  was:
Round-tripping with ParquetWriter.write_table() -> pyarrow.parquet.read_table() -> ParquetWriter.write_table() is broken:

 

{color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pa{color}
 {color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000}.{color}{color:#267f99}parquet{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pq{color}

{color:#001080}schema{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.schema({{color}
 {color:#000000}    {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.uint32(),{color}
 })

with{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
 {color:#000000}    {color}{color:#001080}code{color}{color:#000000} = {color}{color:#098658}111000{color}
 {color:#000000}    {color}{color:#001080}table{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.Table.from_pydict({{color}
 {color:#000000}        {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.nulls({color}{color:#098658}10{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}.field({color}{color:#a31515}'code'{color}{color:#000000}).type).fill_null({color}{color:#001080}code{color}{color:#000000}),{color}
 {color:#000000}    }){color}
 {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}table{color}{color:#000000}){color}

{color:#001080}existing_table{color}{color:#000000} = {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#795e26}read_table{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}){color}

{color:#af00db}with{color}{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
 {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}existing_table{color}{color:#000000}){color}

 

 
----
*Error Message:*

ValueError: Table schema does not match schema used to create file:
table:
code: int64
  -- field metadata --
  PARQUET:field_id: '1' vs.
file:
code: uint32

     Issue Type: Improvement  (was: Bug)
       Priority: Minor  (was: Critical)
        Summary: [python] Warn users if ParquetWriter coerces an explicitly defined schema  (was: [python] Broken round-trip with ParquetWriter.write_table -> read_table -> ParquetWriter.write_table)

> [python] Warn users if ParquetWriter coerces an explicitly defined schema
> -------------------------------------------------------------------------
>
>                 Key: ARROW-11678
>                 URL: https://issues.apache.org/jira/browse/ARROW-11678
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: Python
>    Affects Versions: 3.0.0
>            Reporter: ARF
>            Priority: Minor
>
> pyarrow should warn users if it coerces an explicitly defined schema. Not warning the user leads to confusing behaviour, as illustrated below:
>  
> {color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pa{color}
>  {color:#af00db}import{color}{color:#000000} {color}{color:#267f99}pyarrow{color}{color:#000000}.{color}{color:#267f99}parquet{color}{color:#000000} {color}{color:#af00db}as{color}{color:#000000} {color}{color:#267f99}pq{color}
> {color:#001080}schema{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.schema({{color}
>  {color:#000000}    {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.uint32(),{color}
>  })
> {color:#af00db}with{color}{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
>  {color:#000000}    {color}{color:#001080}code{color}{color:#000000} = {color}{color:#098658}111000{color}
>  {color:#000000}    {color}{color:#001080}table{color}{color:#000000} = {color}{color:#267f99}pa{color}{color:#000000}.Table.from_pydict({{color}
>  {color:#000000}        {color}{color:#a31515}'code'{color}{color:#000000}: {color}{color:#267f99}pa{color}{color:#000000}.nulls({color}{color:#098658}10{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}.field({color}{color:#a31515}'code'{color}{color:#000000}).type).fill_null({color}{color:#001080}code{color}{color:#000000}),{color}
>  {color:#000000}    }){color}
>  {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}table{color}{color:#000000}){color}
> {color:#001080}existing_table{color}{color:#000000} = {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#795e26}read_table{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}){color}
> {color:#af00db}with{color}{color:#000000} {color}{color:#267f99}pq{color}{color:#000000}.{color}{color:#267f99}ParquetWriter{color}{color:#000000}({color}{color:#a31515}'test_metadata.parquet'{color}{color:#000000}, {color}{color:#001080}schema{color}{color:#000000}) {color}{color:#af00db}as{color}{color:#000000} {color}{color:#001080}pqwriter{color}{color:#000000}:{color}
>  {color:#000000}    {color}{color:#001080}pqwriter{color}{color:#000000}.{color}{color:#795e26}write_table{color}{color:#000000}({color}{color:#001080}existing_table{color}{color:#000000}){color}
>  
>  
> ----
> *Error Message:*
> ValueError: Table schema does not match schema used to create file:
>  table:
>  code: int64
>  -- field metadata --
>  PARQUET:field_id: '1' vs.
>  file:
>  code: uint32



--
This message was sent by Atlassian Jira
(v8.3.4#803005)