You are viewing a plain text version of this content. The canonical link for it is here.
Posted to jira@arrow.apache.org by "Samay Kapadia (Jira)" <ji...@apache.org> on 2020/12/18 11:28:00 UTC
[jira] [Updated] (ARROW-10958) "Nested data conversions not implemented" through glib, but not through pyarrow

     [ https://issues.apache.org/jira/browse/ARROW-10958?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Samay Kapadia updated ARROW-10958:
----------------------------------
    Description: 
Hey all,

For some context, I am trying to use Arrow's GLib interface through Julia; I have a sense that I can speed up my pandas workflows by using Julia and Apache Arrow.

I have a 1.7GB parquet file that can be read in about 20s by using pyarrow's parquet reader
{code:java}
pq.read_table(path)
{code}
but I've tried to do the same through the GLib interface in Julia and I'm seeing
{code:julia}
[parquet][arrow][file-reader][read-table]: NotImplemented: Nested data conversions not implemented for chunked array outputs
{code}

Arrow was installed using {{brew install apache-arrow-glib}} and it installed version 2.0.0

Here's my Julia code:
{code:java}
using Pkg
Pkg.add("Gtk")
using Gtk.GLib
using Gtk

# Location of the Parquet file handed to the reader below (shown only as "..." here).
path = "..." # contains columns that are lists of strings

# Julia-side stand-ins for the C types exposed by arrow-glib / parquet-glib.
# The functions in this script only use them as the element type of a `Ptr{...}`.
# NOTE(review): the single `Cint` field is presumably a placeholder rather than the
# real C layout -- confirm against the arrow-glib headers before loading one by value.

# Instance type behind the pointer returned by `gparquet_arrow_file_reader_new_path`.
struct _GParquetArrowFileReader
    parent_instance::Cint
end

const GParquetArrowFileReader = _GParquetArrowFileReader

# Class counterpart of the reader type.
struct _GParquetArrowFileReaderClass
    parent_class::Cint
end

const GParquetArrowFileReaderClass = _GParquetArrowFileReaderClass

# Instance type behind the pointer returned by `gparquet_arrow_file_reader_read_table`.
struct _GArrowTable
    parent_instance::Cint
end

const GArrowTable = _GArrowTable

# Class counterpart of the table type.
struct _GArrowTableClass
    parent_class::Cint
end

const GArrowTableClass = _GArrowTableClass

"""
    parquet_arrow_file_reader_new_path(path::String)

Call `gparquet_arrow_file_reader_new_path` from libparquet-glib with `path` and return
the resulting `Ptr{GParquetArrowFileReader}`.

The C call runs inside a `GError() do ... end` block, which supplies the `GError**`
out-argument; the block's `Bool` result tells `GError` whether a non-null pointer came
back. NOTE(review): `GError` presumably raises a Julia error when the block returns
`false` or the error slot was filled -- confirm against Gtk.jl's GLib module.
"""
function parquet_arrow_file_reader_new_path(path::String)::Ptr{GParquetArrowFileReader}
    ret::Ptr{GParquetArrowFileReader} = C_NULL
    GError() do error_check
        ret = ccall(
            (:gparquet_arrow_file_reader_new_path, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"),
            Ptr{GParquetArrowFileReader},
            (Ptr{UInt8}, Ptr{Ptr{GError}}),
            Gtk.bytestring(path), error_check
        )
        # Compare with C_NULL: a `Ptr` never compares equal to the integer 0, so the
        # previous `ret != 0` was `true` even when the C call returned NULL.
        ret != C_NULL
    end
    return ret
end

"""
    parquet_arrow_file_reader_read_table(reader::Ptr{GParquetArrowFileReader})

Call `gparquet_arrow_file_reader_read_table` from libparquet-glib on `reader` and return
the resulting `Ptr{GArrowTable}`.

The C call runs inside a `GError() do ... end` block, which supplies the `GError**`
out-argument; the block's `Bool` result tells `GError` whether a non-null pointer came
back. NOTE(review): `GError` presumably raises a Julia error when the block returns
`false` or the error slot was filled -- confirm against Gtk.jl's GLib module.
"""
function parquet_arrow_file_reader_read_table(reader::Ptr{GParquetArrowFileReader})::Ptr{GArrowTable}
    ret::Ptr{GArrowTable} = C_NULL
    GError() do error_check
        ret = ccall(
            (:gparquet_arrow_file_reader_read_table, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"),
            # The C function hands back a table, so declare the table pointer type here
            # (this was `Ptr{GParquetArrowFileReader}`, copied from the constructor).
            Ptr{GArrowTable},
            (Ptr{GParquetArrowFileReader}, Ptr{Ptr{GError}}),
            reader, error_check
        )
        # Compare with C_NULL: a `Ptr` never compares equal to the integer 0, so the
        # previous `ret != 0` was `true` even when the C call returned NULL.
        ret != C_NULL
    end
    return ret
end

# Open a Parquet reader on `path`, then read a table from it.
reader = parquet_arrow_file_reader_new_path(path)
# NOTE(review): presumably the call that reports the "[read-table]: NotImplemented"
# error quoted in this report -- confirm.
tbl = parquet_arrow_file_reader_read_table(reader)
{code}
Am I doing something wrong or is there a behavior discrepancy between pyarrow and glib?

  was:
Hey all,

For some context, I am trying to use Arrow's GLib interface through Julia; I have a sense that I can speed up my pandas workflows by using Julia and Apache Arrow.

I have a 1.7GB parquet file that can be read in about 20s by using pyarrow's parquet reader
{code:java}
pq.read_table(path)
{code}
but I've tried to do the same through the GLib interface in Julia and I'm seeing
{code:java}
[parquet][arrow][file-reader][read-table]: NotImplemented: Nested data conversions not implemented for chunked array outputs
{code}
{{Arrow was installed using }}

{{brew install apache-arrow-glib}}

{{and it installed version 2.0.0}}

Here's my Julia code:
{code:java}
using Pkg
Pkg.add("Gtk")
using Gtk.GLib
using Gtk

# Location of the Parquet file handed to the reader below (shown only as "..." here).
path = "..." # contains columns that are lists of strings

# Julia-side stand-ins for the C types exposed by arrow-glib / parquet-glib.
# The functions in this script only use them as the element type of a `Ptr{...}`.
# NOTE(review): the single `Cint` field is presumably a placeholder rather than the
# real C layout -- confirm against the arrow-glib headers before loading one by value.

# Instance type behind the pointer returned by `gparquet_arrow_file_reader_new_path`.
struct _GParquetArrowFileReader
    parent_instance::Cint
end

const GParquetArrowFileReader = _GParquetArrowFileReader

# Class counterpart of the reader type.
struct _GParquetArrowFileReaderClass
    parent_class::Cint
end

const GParquetArrowFileReaderClass = _GParquetArrowFileReaderClass

# Instance type behind the pointer returned by `gparquet_arrow_file_reader_read_table`.
struct _GArrowTable
    parent_instance::Cint
end

const GArrowTable = _GArrowTable

# Class counterpart of the table type.
struct _GArrowTableClass
    parent_class::Cint
end

const GArrowTableClass = _GArrowTableClass

"""
    parquet_arrow_file_reader_new_path(path::String)

Call `gparquet_arrow_file_reader_new_path` from libparquet-glib with `path` and return
the resulting `Ptr{GParquetArrowFileReader}`.

The C call runs inside a `GError() do ... end` block, which supplies the `GError**`
out-argument; the block's `Bool` result tells `GError` whether a non-null pointer came
back. NOTE(review): `GError` presumably raises a Julia error when the block returns
`false` or the error slot was filled -- confirm against Gtk.jl's GLib module.
"""
function parquet_arrow_file_reader_new_path(path::String)::Ptr{GParquetArrowFileReader}
    ret::Ptr{GParquetArrowFileReader} = C_NULL
    GError() do error_check
        ret = ccall(
            (:gparquet_arrow_file_reader_new_path, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"),
            Ptr{GParquetArrowFileReader},
            (Ptr{UInt8}, Ptr{Ptr{GError}}),
            Gtk.bytestring(path), error_check
        )
        # Compare with C_NULL: a `Ptr` never compares equal to the integer 0, so the
        # previous `ret != 0` was `true` even when the C call returned NULL.
        ret != C_NULL
    end
    return ret
end

"""
    parquet_arrow_file_reader_read_table(reader::Ptr{GParquetArrowFileReader})

Call `gparquet_arrow_file_reader_read_table` from libparquet-glib on `reader` and return
the resulting `Ptr{GArrowTable}`.

The C call runs inside a `GError() do ... end` block, which supplies the `GError**`
out-argument; the block's `Bool` result tells `GError` whether a non-null pointer came
back. NOTE(review): `GError` presumably raises a Julia error when the block returns
`false` or the error slot was filled -- confirm against Gtk.jl's GLib module.
"""
function parquet_arrow_file_reader_read_table(reader::Ptr{GParquetArrowFileReader})::Ptr{GArrowTable}
    ret::Ptr{GArrowTable} = C_NULL
    GError() do error_check
        ret = ccall(
            (:gparquet_arrow_file_reader_read_table, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"),
            # The C function hands back a table, so declare the table pointer type here
            # (this was `Ptr{GParquetArrowFileReader}`, copied from the constructor).
            Ptr{GArrowTable},
            (Ptr{GParquetArrowFileReader}, Ptr{Ptr{GError}}),
            reader, error_check
        )
        # Compare with C_NULL: a `Ptr` never compares equal to the integer 0, so the
        # previous `ret != 0` was `true` even when the C call returned NULL.
        ret != C_NULL
    end
    return ret
end

# Open a Parquet reader on `path`, then read a table from it.
reader = parquet_arrow_file_reader_new_path(path)
# NOTE(review): presumably the call that reports the "[read-table]: NotImplemented"
# error quoted in this report -- confirm.
tbl = parquet_arrow_file_reader_read_table(reader)
{code}
Am I doing something wrong or is there a behavior discrepancy between pyarrow and glib?


> "Nested data conversions not implemented" through glib, but not through pyarrow
> -------------------------------------------------------------------------------
>
>                 Key: ARROW-10958
>                 URL: https://issues.apache.org/jira/browse/ARROW-10958
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: GLib
>    Affects Versions: 2.0.0
>         Environment: macOS Catalina 10.15.7
>            Reporter: Samay Kapadia
>            Priority: Major
>
> Hey all,
> For some context, I am trying to use Arrow's GLib interface through Julia; I have a sense that I can speed up my pandas workflows by using Julia and Apache Arrow.
> I have a 1.7GB parquet file that can be read in about 20s by using pyarrow's parquet reader
> {code:java}
> pq.read_table(path)
> {code}
> but I've tried to do the same through the GLib interface in Julia and I'm seeing
> {code:julia}
> [parquet][arrow][file-reader][read-table]: NotImplemented: Nested data conversions not implemented for chunked array outputs
> {code}
> Arrow was installed using {{brew install apache-arrow-glib}} and it installed version 2.0.0
> Here's my Julia code:
> {code:java}
> using Pkg
> Pkg.add("Gtk")
> using Gtk.GLib
> using Gtk
> path = "..." # contains columns that are lists of strings
> struct _GParquetArrowFileReader
>     parent_instance::Cint
> end
> const GParquetArrowFileReader = _GParquetArrowFileReaderstruct 
> _GParquetArrowFileReaderClass
>     parent_class::Cint
> end
> const GParquetArrowFileReaderClass = _GParquetArrowFileReaderClass
> struct _GArrowTable
>     parent_instance::Cint
> end
> const GArrowTable = _GArrowTable
> struct _GArrowTableClass
>     parent_class::Cint
> end
> const GArrowTableClass = _GArrowTableClass
> function parquet_arrow_file_reader_new_path(path::String)::Ptr{GParquetArrowFileReader}
>     ret::Ptr{GParquetArrowFileReader} = 0
>     GError() do error_check
>         ret = ccall(
>             (:gparquet_arrow_file_reader_new_path, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"), 
>             Ptr{GParquetArrowFileReader}, 
>             (Ptr{UInt8}, Ptr{Ptr{GError}}), 
>             Gtk.bytestring(path), error_check
>         )
>         ret != 0
>     end
>     ret
> end
> function parquet_arrow_file_reader_read_table(reader::Ptr{GParquetArrowFileReader})::Ptr{GArrowTable}
>     ret::Ptr{GArrowTable} = 0
>     GError() do error_check
>         ret = ccall(
>             (:gparquet_arrow_file_reader_read_table, "/usr/local/Cellar/apache-arrow-glib/2.0.0/lib/libparquet-glib.200"), 
>             Ptr{GParquetArrowFileReader}, 
>             (Ptr{GParquetArrowFileReader}, Ptr{Ptr{GError}}), 
>             reader, error_check
>         )
>         ret != 0
>     end
>     ret
> end
> reader = parquet_arrow_file_reader_new_path(path)
> tbl = parquet_arrow_file_reader_read_table(reader)
> {code}
> Am I doing something wrong or is there a behavior discrepancy between pyarrow and glib?



--
This message was sent by Atlassian Jira
(v8.3.4#803005)