You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "David Lee (JIRA)" <ji...@apache.org> on 2018/12/17 19:45:00 UTC

[jira] [Comment Edited] (ARROW-4032) [Python] New pyarrow.Table functions: from_pydict(), from_pylist() and to_pylist()

    [ https://issues.apache.org/jira/browse/ARROW-4032?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16721986#comment-16721986 ] 

David Lee edited comment on ARROW-4032 at 12/17/18 7:44 PM:
------------------------------------------------------------

Ended up just writing from_pylist(), to_pylist() and from_pydict(). They run much faster than going through pandas.
{code}
def from_pylist(pylist, names=None, schema=None, safe=True):
    """Build a ``pyarrow.Table`` from a list of row dictionaries.

    Each element of ``pylist`` maps column names to values. A row that lacks
    a requested column contributes ``None`` for that column.

    Args:
        pylist: List of row mappings. It is iterated once per column, so it
            must be re-iterable (a list, not a one-shot generator).
        names: Column names to extract when ``schema`` is not used. When
            ``None``, the names are collected from the keys of the rows in
            first-seen order.
        schema: Optional ``pyarrow.Schema``. When truthy it decides the
            column order and the type requested for every array, and
            ``names`` is ignored.
        safe: Forwarded unchanged to ``pa.array``.

    Returns:
        The table returned by ``pa.Table.from_arrays``.
    """
    if schema:
        # Names and types are paired positionally, so no per-column
        # get_field_index() lookup is needed.
        arrow_columns = [
            pa.array(
                [row.get(column) for row in pylist],
                safe=safe,
                type=column_type,
            )
            for column, column_type in zip(schema.names, schema.types)
        ]
        return pa.Table.from_arrays(arrow_columns, schema.names)

    if names is None:
        # Previously this path failed with "NoneType is not iterable".
        # dict.fromkeys de-duplicates while keeping first-seen order.
        names = list(dict.fromkeys(key for row in pylist for key in row))

    arrow_columns = [
        pa.array([row.get(column) for row in pylist], safe=safe)
        for column in names
    ]
    return pa.Table.from_arrays(arrow_columns, names)

def to_pylist(arrow_table):
    """Convert a table into a list of ``{column name: cell}`` dictionaries.

    One dictionary is produced per row, keyed by ``arrow_table.schema.names``.
    Cells are whatever ``arrow_table[column_index][row_index]`` yields.

    NOTE(review): with pyarrow those cells may be Arrow scalar wrappers rather
    than plain Python objects -- confirm whether a conversion is wanted.
    """
    field_names = arrow_table.schema.names
    column_count = arrow_table.num_columns

    records = []
    for row_index in range(arrow_table.num_rows):
        record = {}
        for column_index in range(column_count):
            cell = arrow_table[column_index][row_index]
            record[field_names[column_index]] = cell
        records.append(record)
    return records

def from_pydict(pydict, names=None, schema=None, safe=True):
    """Build a ``pyarrow.Table`` from a mapping of column name to values.

    A requested column that is absent from ``pydict`` is filled with one
    ``None`` per row. The row count is the length of the first column in
    ``pydict``, or zero when ``pydict`` is empty.

    Args:
        pydict: Mapping of column name to a sequence of values.
        names: Column names (and order) to use when ``schema`` is not used.
            Defaults to the keys of ``pydict`` when empty or ``None``.
        schema: Optional ``pyarrow.Schema``. When truthy it decides the
            column order and the type requested for every array, and
            ``names`` is ignored.
        safe: Forwarded unchanged to ``pa.array``.

    Returns:
        The table returned by ``pa.Table.from_arrays``.
    """

    def null_values():
        # Evaluated only when a column is actually missing, as before.
        # An empty mapping yields zero rows instead of raising IndexError.
        first_column = next(iter(pydict.values()), ())
        return [None] * len(first_column)

    if schema:
        # Names and types are paired positionally, so no per-column
        # get_field_index() lookup is needed.
        arrow_columns = [
            pa.array(
                pydict[column] if column in pydict else null_values(),
                safe=safe,
                type=column_type,
            )
            for column, column_type in zip(schema.names, schema.types)
        ]
        return pa.Table.from_arrays(arrow_columns, schema.names)

    if not names:
        names = list(pydict)

    arrow_columns = [
        pa.array(
            pydict[column] if column in pydict else null_values(),
            safe=safe,
        )
        for column in names
    ]
    return pa.Table.from_arrays(arrow_columns, names)
{code}


was (Author: davlee1972@yahoo.com):
Ended up just writing from_pylist() and to_pylist(). They run much faster than going through pandas.
{code:python}
def from_pylist(pylist, names=None, schema=None, safe=True):
    """Build a ``pyarrow.Table`` from a list of row dictionaries.

    Each element of ``pylist`` maps column names to values. A row that lacks
    a requested column contributes ``None`` for that column.

    Args:
        pylist: List of row mappings. It is iterated once per column, so it
            must be re-iterable (a list, not a one-shot generator).
        names: Column names to extract when ``schema`` is not used. When
            ``None``, the names are collected from the keys of the rows in
            first-seen order.
        schema: Optional ``pyarrow.Schema``. When truthy it decides the
            column order and the type requested for every array, and
            ``names`` is ignored.
        safe: Forwarded unchanged to ``pa.array``.

    Returns:
        The table returned by ``pa.Table.from_arrays``.
    """
    if schema:
        # Names and types are paired positionally, so no per-column
        # get_field_index() lookup is needed.
        arrow_columns = [
            pa.array(
                [row.get(column) for row in pylist],
                safe=safe,
                type=column_type,
            )
            for column, column_type in zip(schema.names, schema.types)
        ]
        return pa.Table.from_arrays(arrow_columns, schema.names)

    if names is None:
        # Previously this path failed with "NoneType is not iterable".
        # dict.fromkeys de-duplicates while keeping first-seen order.
        names = list(dict.fromkeys(key for row in pylist for key in row))

    arrow_columns = [
        pa.array([row.get(column) for row in pylist], safe=safe)
        for column in names
    ]
    return pa.Table.from_arrays(arrow_columns, names)

def to_pylist(arrow_table):
    """Convert a table into a list of ``{column name: cell}`` dictionaries.

    One dictionary is produced per row, keyed by ``arrow_table.schema.names``.
    Cells are whatever ``arrow_table[column_index][row_index]`` yields.

    NOTE(review): with pyarrow those cells may be Arrow scalar wrappers rather
    than plain Python objects -- confirm whether a conversion is wanted.
    """
    field_names = arrow_table.schema.names
    column_count = arrow_table.num_columns

    records = []
    for row_index in range(arrow_table.num_rows):
        record = {}
        for column_index in range(column_count):
            cell = arrow_table[column_index][row_index]
            record[field_names[column_index]] = cell
        records.append(record)
    return records

def from_pydict(pydict, names=None, schema=None, safe=True):
    """Build a ``pyarrow.Table`` from a mapping of column name to values.

    This revision previously accepted ``names``, ``schema`` and ``safe`` but
    ignored all three; they are now honoured.

    A requested column that is absent from ``pydict`` is filled with one
    ``None`` per row. The row count is the length of the first column in
    ``pydict``, or zero when ``pydict`` is empty.

    Args:
        pydict: Mapping of column name to a sequence of values.
        names: Column names (and order) to use when ``schema`` is not used.
            Defaults to the keys of ``pydict`` when empty or ``None``.
        schema: Optional ``pyarrow.Schema``. When truthy it decides the
            column order and the type requested for every array, and
            ``names`` is ignored.
        safe: Forwarded unchanged to ``pa.array``.

    Returns:
        The table returned by ``pa.Table.from_arrays``.
    """

    def null_values():
        # Evaluated only when a requested column is missing from pydict.
        first_column = next(iter(pydict.values()), ())
        return [None] * len(first_column)

    if schema:
        arrow_columns = [
            pa.array(
                pydict[column] if column in pydict else null_values(),
                safe=safe,
                type=column_type,
            )
            for column, column_type in zip(schema.names, schema.types)
        ]
        return pa.Table.from_arrays(arrow_columns, schema.names)

    if not names:
        names = list(pydict)

    arrow_columns = [
        pa.array(
            pydict[column] if column in pydict else null_values(),
            safe=safe,
        )
        for column in names
    ]
    return pa.Table.from_arrays(arrow_columns, names)
{code}

> [Python] New pyarrow.Table functions: from_pydict(), from_pylist() and to_pylist()
> ----------------------------------------------------------------------------------
>
>                 Key: ARROW-4032
>                 URL: https://issues.apache.org/jira/browse/ARROW-4032
>             Project: Apache Arrow
>          Issue Type: Task
>          Components: Python
>            Reporter: David Lee
>            Priority: Minor
>
> Here's a proposal to create a pyarrow.Table.from_pydict() function.
> Right now only pyarrow.Table.from_pandas() exists, and there are inherent problems using pandas with NULL support for ints and booleans.
> [http://pandas.pydata.org/pandas-docs/version/0.23.4/gotchas.html]
> {{NaN}}, Integer {{NA}} values and {{NA}} type promotions:
> Sample python code on how this would work.
>  
> {code:python}
> import pyarrow as pa
> from datetime import datetime
> # convert microseconds to milliseconds. More support for MS in parquet.
> today = datetime.now()
> today = datetime(today.year, today.month, today.day, today.hour, today.minute, today.second, today.microsecond - today.microsecond % 1000)
> test_list = [
> {"name": "Tom", "age": 10},
> {"name": "Mark", "age": 5, "city": "San Francisco"},
> {"name": "Pam", "age": 7, "birthday": today}
> ]
> def from_pylist(pylist, schema=None, columns=None, safe=True):
>     arrow_columns = list()
>     if schema:
>         columns = schema.names
>     if not columns:
>         return
>     for column in columns:
>         arrow_columns.append(pa.array([v[column] if column in v else None for v in pylist], safe=safe))
>     arrow_table = pa.Table.from_arrays(arrow_columns, columns)
>     if schema:
>         arrow_table = arrow_table.cast(schema, safe=safe)
>     return arrow_table
> test = from_pylist(test_list, columns=['name' , 'age', 'city', 'birthday', 'dummy'])
> test_schema = pa.schema([
> pa.field('name', pa.string()),
> pa.field('age', pa.int16()),
> pa.field('city', pa.string()),
> pa.field('birthday', pa.timestamp('ms'))
> ])
> test2 = from_pylist(test_list, schema=test_schema)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)