You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2018/07/14 06:01:30 UTC
[arrow] branch master updated: ARROW-2829: [GLib] Add
GArrowORCFileReader
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2c90eff ARROW-2829: [GLib] Add GArrowORCFileReader
2c90eff is described below
commit 2c90eff02c212ccf0c21a3f05b5371a801352e0f
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Sat Jul 14 15:01:13 2018 +0900
ARROW-2829: [GLib] Add GArrowORCFileReader
c_glib/test/fixture/TestOrcFile.test1.orc is copied from
https://github.com/apache/orc/blob/master/examples/TestOrcFile.test1.orc .
Its license is Apache License 2.0.
Author: Kouhei Sutou <ko...@clear-code.com>
Closes #2250 from kou/glib-support-orc and squashes the following commits:
db0404af [Kouhei Sutou] [GLib] Use ORC term not Arrow term for consistency
f7682f79 [Kouhei Sutou] [GLib] Add GObject Introspection version check for old one
43dd9ba6 [Kouhei Sutou] [GLib] Enable ORC on Travis CI
4ac6407a [Kouhei Sutou] [deb] Add arrow-glib-orc.pc
9c7ab775 [Kouhei Sutou] [GLib] Clean private related code
61b882f5 [Kouhei Sutou] [GLib] Use accessor for field indexes for future API extension
7f0c2243 [Kouhei Sutou] [GLib] Fix install error with Meson + ORC
40aadaac [Kouhei Sutou] [GLib] Add GArrowORCFileReader
---
.travis.yml | 3 +
c_glib/arrow-glib/Makefile.am | 20 +
.../arrow-glib-orc.pc.in} | 40 +-
c_glib/arrow-glib/meson.build | 24 ++
c_glib/arrow-glib/orc-file-reader.cpp | 407 +++++++++++++++++++++
c_glib/arrow-glib/orc-file-reader.h | 60 +++
c_glib/arrow-glib/orc-file-reader.hpp | 31 ++
c_glib/configure.ac | 16 +
c_glib/doc/reference/Makefile.am | 5 +
c_glib/doc/reference/arrow-glib-docs.xml | 1 +
c_glib/doc/reference/meson.build | 7 +
c_glib/meson.build | 2 +
c_glib/test/fixture/TestOrcFile.test1.orc | Bin 0 -> 1711 bytes
c_glib/test/{run-test.rb => helper/fixture.rb} | 34 +-
c_glib/test/run-test.rb | 1 +
c_glib/test/test-orc-file-reader.rb | 244 ++++++++++++
.../debian/libarrow-glib-dev.install | 1 +
17 files changed, 834 insertions(+), 62 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index 51253f2..24b32a4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -163,6 +163,7 @@ matrix:
language: cpp
os: linux
env:
+ - ARROW_TRAVIS_ORC=1
- BUILD_TORCH_EXAMPLE=no
- CC="gcc-4.9"
- CXX="g++-4.9"
@@ -179,6 +180,8 @@ matrix:
- compiler: clang
osx_image: xcode8.3
os: osx
+ env:
+ - ARROW_TRAVIS_ORC=1
cache:
addons:
rvm: 2.2
diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am
index 845048d..7017143 100644
--- a/c_glib/arrow-glib/Makefile.am
+++ b/c_glib/arrow-glib/Makefile.am
@@ -82,6 +82,11 @@ libarrow_glib_la_headers += \
libarrow_glib_la_headers += \
compute.h
+if HAVE_ARROW_ORC
+libarrow_glib_la_headers += \
+ orc-file-reader.h
+endif
+
libarrow_glib_la_generated_headers = \
enums.h \
version.h
@@ -128,6 +133,11 @@ libarrow_glib_la_sources += \
libarrow_glib_la_sources += \
compute.cpp
+if HAVE_ARROW_ORC
+libarrow_glib_la_sources += \
+ orc-file-reader.cpp
+endif
+
libarrow_glib_la_cpp_headers = \
array.hpp \
array-builder.hpp \
@@ -165,6 +175,11 @@ libarrow_glib_la_cpp_headers += \
libarrow_glib_la_cpp_headers += \
compute.hpp
+if HAVE_ARROW_ORC
+libarrow_glib_la_cpp_headers += \
+ orc-file-reader.hpp
+endif
+
libarrow_glib_la_SOURCES = \
$(libarrow_glib_la_sources) \
$(libarrow_glib_la_cpp_headers)
@@ -217,6 +232,11 @@ pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = \
arrow-glib.pc
+if HAVE_ARROW_ORC
+pkgconfig_DATA += \
+ arrow-glib-orc.pc
+endif
+
if HAVE_INTROSPECTION
-include $(INTROSPECTION_MAKEFILE)
INTROSPECTION_GIRS =
diff --git a/c_glib/test/run-test.rb b/c_glib/arrow-glib/arrow-glib-orc.pc.in
old mode 100755
new mode 100644
similarity index 55%
copy from c_glib/test/run-test.rb
copy to c_glib/arrow-glib/arrow-glib-orc.pc.in
index 392c56f..5d22e14
--- a/c_glib/test/run-test.rb
+++ b/c_glib/arrow-glib/arrow-glib-orc.pc.in
@@ -1,5 +1,3 @@
-#!/usr/bin/env ruby
-#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -17,34 +15,12 @@
# specific language governing permissions and limitations
# under the License.
-require "pathname"
-require "test-unit"
-
-base_dir = Pathname(__dir__).parent
-test_dir = base_dir + "test"
-
-require "gi"
-
-Gio = GI.load("Gio")
-Arrow = GI.load("Arrow")
-module Arrow
- class Buffer
- alias_method :initialize_raw, :initialize
- def initialize(data)
- initialize_raw(data)
- @data = data
- end
- end
-end
-
-begin
- ArrowGPU = GI.load("ArrowGPU")
-rescue GObjectIntrospection::RepositoryError::TypelibNotFound
-end
-
-require "rbconfig"
-require "tempfile"
-require_relative "helper/buildable"
-require_relative "helper/omittable"
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
-exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
+Name: Apache Arrow GLib ORC
+Description: ORC modules for Apache Arrow GLib
+Version: @VERSION@
+Requires: arrow-glib
diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build
index 5e3dbe2..e5f4860 100644
--- a/c_glib/arrow-glib/meson.build
+++ b/c_glib/arrow-glib/meson.build
@@ -57,6 +57,12 @@ sources += files(
'compute.cpp',
)
+if arrow_orc_dependency.found()
+ sources += files(
+ 'orc-file-reader.cpp',
+ )
+endif
+
c_headers = files(
'array.h',
'array-builder.h',
@@ -102,6 +108,12 @@ c_headers += files(
'compute.h',
)
+if arrow_orc_dependency.found()
+ c_headers += files(
+ 'orc-file-reader.h',
+ )
+endif
+
cpp_headers = files(
'array.hpp',
@@ -144,6 +156,11 @@ cpp_headers += files(
'compute.hpp',
)
+if arrow_orc_dependency.found()
+ cpp_headers += files(
+ 'orc-file-reader.hpp',
+ )
+endif
version_h_conf = configuration_data()
version_h_conf.set('GARROW_VERSION_MAJOR', version_major)
@@ -198,6 +215,13 @@ pkgconfig.generate(filebase: meson.project_name(),
version: version,
requires: ['gio-2.0', 'arrow'],
libraries: [libarrow_glib])
+if arrow_orc_dependency.found()
+ pkgconfig.generate(filebase: meson.project_name() + '-orc', # arrow-glib-orc.pc; must not reuse the arrow-glib.pc base name
+ name: 'Apache Arrow GLib ORC',
+ description: 'ORC modules for Apache Arrow GLib',
+ version: version,
+ requires: ['arrow-glib'])
+endif
arrow_glib_gir = gnome.generate_gir(libarrow_glib,
sources: sources + c_headers + enums,
diff --git a/c_glib/arrow-glib/orc-file-reader.cpp b/c_glib/arrow-glib/orc-file-reader.cpp
new file mode 100644
index 0000000..87ba563
--- /dev/null
+++ b/c_glib/arrow-glib/orc-file-reader.cpp
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <arrow-glib/error.hpp>
+#include <arrow-glib/input-stream.hpp>
+#include <arrow-glib/orc-file-reader.hpp>
+#include <arrow-glib/record-batch.hpp>
+#include <arrow-glib/schema.hpp>
+#include <arrow-glib/table.hpp>
+
+G_BEGIN_DECLS
+
+/**
+ * SECTION: orc-file-reader
+ * @section_id: orc-file-reader
+ * @title: ORC reader
+ * @include: arrow-glib/orc-file-reader.h
+ *
+ * #GArrowORCFileReader is a class for reading stripes in ORC file
+ * format from input.
+ */
+
+typedef struct GArrowORCFileReaderPrivate_ {
+ GArrowSeekableInputStream *input;
+ arrow::adapters::orc::ORCFileReader *orc_file_reader;
+ GArray *field_indexes;
+} GArrowORCFileReaderPrivate;
+
+enum {
+ PROP_0,
+ PROP_INPUT,
+ PROP_ORC_FILE_READER
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowORCFileReader,
+ garrow_orc_file_reader,
+ G_TYPE_OBJECT);
+
+#define GARROW_ORC_FILE_READER_GET_PRIVATE(obj) \
+ static_cast<GArrowORCFileReaderPrivate *>( \
+ garrow_orc_file_reader_get_instance_private( \
+ GARROW_ORC_FILE_READER(obj)))
+
+static void
+garrow_orc_file_reader_dispose(GObject *object)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(object);
+
+ if (priv->input) {
+ g_object_unref(priv->input);
+ priv->input = NULL;
+ }
+
+ G_OBJECT_CLASS(garrow_orc_file_reader_parent_class)->dispose(object);
+}
+
+static void
+garrow_orc_file_reader_finalize(GObject *object)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(object);
+
+ delete priv->orc_file_reader;
+
+ if (priv->field_indexes) {
+ g_array_free(priv->field_indexes, TRUE);
+ }
+
+ G_OBJECT_CLASS(garrow_orc_file_reader_parent_class)->finalize(object);
+}
+
+static void
+garrow_orc_file_reader_set_property(GObject *object,
+ guint prop_id,
+ const GValue *value,
+ GParamSpec *pspec)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(object);
+
+ switch (prop_id) {
+ case PROP_INPUT:
+ priv->input = GARROW_SEEKABLE_INPUT_STREAM(g_value_dup_object(value));
+ break;
+ case PROP_ORC_FILE_READER:
+ priv->orc_file_reader =
+ static_cast<arrow::adapters::orc::ORCFileReader *>(g_value_get_pointer(value));
+ break;
+ default:
+ G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+ break;
+ }
+}
+
+static void
+garrow_orc_file_reader_get_property(GObject *object,
+ guint prop_id,
+ GValue *value,
+ GParamSpec *pspec)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(object);
+
+ switch (prop_id) {
+ case PROP_INPUT:
+ g_value_set_object(value, priv->input);
+ break;
+ default:
+ G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+ break;
+ }
+}
+
+static void
+garrow_orc_file_reader_init(GArrowORCFileReader *object)
+{
+}
+
+static void
+garrow_orc_file_reader_class_init(GArrowORCFileReaderClass *klass)
+{
+ auto gobject_class = G_OBJECT_CLASS(klass);
+
+ gobject_class->dispose = garrow_orc_file_reader_dispose;
+ gobject_class->finalize = garrow_orc_file_reader_finalize;
+ gobject_class->set_property = garrow_orc_file_reader_set_property;
+ gobject_class->get_property = garrow_orc_file_reader_get_property;
+
+ GParamSpec *spec;
+ spec = g_param_spec_object("input",
+ "Input",
+ "The input stream",
+ GARROW_TYPE_SEEKABLE_INPUT_STREAM,
+ static_cast<GParamFlags>(G_PARAM_READWRITE |
+ G_PARAM_CONSTRUCT_ONLY));
+ g_object_class_install_property(gobject_class, PROP_INPUT, spec);
+
+ spec = g_param_spec_pointer("orc-file-reader",
+ "arrow::adapters::orc::ORCFileReader",
+ "The raw arrow::adapters::orc::ORCFileReader *",
+ static_cast<GParamFlags>(G_PARAM_WRITABLE |
+ G_PARAM_CONSTRUCT_ONLY));
+ g_object_class_install_property(gobject_class, PROP_ORC_FILE_READER, spec);
+}
+
+
+/**
+ * garrow_orc_file_reader_new:
+ * @file: The file to be read.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowORCFileReader
+ * or %NULL on error.
+ *
+ * Since: 0.10.0
+ */
+GArrowORCFileReader *
+garrow_orc_file_reader_new(GArrowSeekableInputStream *input,
+ GError **error)
+{
+ auto arrow_random_access_file = garrow_seekable_input_stream_get_raw(input);
+ auto pool = arrow::default_memory_pool();
+ std::unique_ptr<arrow::adapters::orc::ORCFileReader> arrow_reader;
+ auto status =
+ arrow::adapters::orc::ORCFileReader::Open(arrow_random_access_file,
+ pool,
+ &arrow_reader);
+ if (garrow_error_check(error, status, "[orc-file-reader][new]")) {
+ return garrow_orc_file_reader_new_raw(input, arrow_reader.release());
+ } else {
+ return NULL;
+ }
+}
+
+/**
+ * garrow_orc_file_reader_set_field_indexes:
+ * @reader: A #GArrowORCFileReader.
+ * @field_indexes: (nullable) (array length=n_field_indexes):
+ * The field indexes to be read.
+ * @n_field_indexes: The number of the specified indexes.
+ *
+ * Since: 0.10.0
+ */
+void
+garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader,
+ const gint *field_indexes,
+ guint n_field_indexes)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(reader);
+ if (priv->field_indexes) {
+ g_array_free(priv->field_indexes, TRUE);
+ }
+ if (n_field_indexes == 0) {
+ priv->field_indexes = NULL;
+ } else {
+ priv->field_indexes = g_array_sized_new(FALSE,
+ FALSE,
+ sizeof(gint),
+ n_field_indexes);
+ g_array_append_vals(priv->field_indexes, field_indexes, n_field_indexes);
+ }
+}
+
+/**
+ * garrow_orc_file_reader_get_field_indexes:
+ * @reader: A #GArrowORCFileReader.
+ * @n_field_indexes: (out): The number of field indexes in the returned array.
+ *
+ * Returns: (nullable) (array length=n_field_indexes) (transfer none):
+ * The field indexes to be read.
+ *
+ * Since: 0.10.0
+ */
+const gint *
+garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader,
+ guint *n_field_indexes)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(reader);
+ if (priv->field_indexes) {
+ *n_field_indexes = priv->field_indexes->len;
+ return reinterpret_cast<gint *>(priv->field_indexes->data);
+ } else {
+ *n_field_indexes = 0;
+ return NULL;
+ }
+}
+
+/**
+ * garrow_orc_file_reader_read_type:
+ * @reader: A #GArrowORCFileReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable) (transfer full): A newly read type as
+ * #GArrowSchema or %NULL on error.
+ *
+ * Since: 0.10.0
+ */
+GArrowSchema *
+garrow_orc_file_reader_read_type(GArrowORCFileReader *reader,
+ GError **error)
+{
+ auto arrow_reader = garrow_orc_file_reader_get_raw(reader);
+ std::shared_ptr<arrow::Schema> arrow_schema;
+ auto status = arrow_reader->ReadSchema(&arrow_schema);
+ if (garrow_error_check(error, status, "[orc-file-reader][read-type]")) {
+ return garrow_schema_new_raw(&arrow_schema);
+ } else {
+ return NULL;
+ }
+}
+
+/**
+ * garrow_orc_file_reader_read_stripes:
+ * @reader: A #GArrowORCFileReader.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable) (transfer full): Newly read stripes as
+ * #GArrowTable or %NULL on error.
+ *
+ * Since: 0.10.0
+ */
+GArrowTable *
+garrow_orc_file_reader_read_stripes(GArrowORCFileReader *reader,
+ GError **error)
+{
+ auto arrow_reader = garrow_orc_file_reader_get_raw(reader);
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(reader);
+ if (priv->field_indexes) {
+ std::vector<int> arrow_field_indexes;
+ auto field_indexes = priv->field_indexes;
+ for (guint i = 0; i < field_indexes->len; ++i) {
+ arrow_field_indexes.push_back(g_array_index(field_indexes, gint, i));
+ }
+ std::shared_ptr<arrow::Table> arrow_table;
+ auto status = arrow_reader->Read(arrow_field_indexes, &arrow_table);
+ if (garrow_error_check(error, status, "[orc-file-reader][read-stripes]")) {
+ return garrow_table_new_raw(&arrow_table);
+ } else {
+ return NULL;
+ }
+ } else {
+ std::shared_ptr<arrow::Table> arrow_table;
+ auto status = arrow_reader->Read(&arrow_table);
+ if (garrow_error_check(error, status, "[orc-file-reader][read-stripes]")) {
+ return garrow_table_new_raw(&arrow_table);
+ } else {
+ return NULL;
+ }
+ }
+}
+
+/**
+ * garrow_orc_file_reader_read_stripe:
+ * @reader: A #GArrowORCFileReader.
+ * @i: The stripe index to be read.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable) (transfer full): A newly read stripe as
+ * #GArrowRecordBatch or %NULL on error.
+ *
+ * Since: 0.10.0
+ */
+GArrowRecordBatch *
+garrow_orc_file_reader_read_stripe(GArrowORCFileReader *reader,
+ gint64 i,
+ GError **error)
+{
+ auto arrow_reader = garrow_orc_file_reader_get_raw(reader);
+ if (i < 0) {
+ i += arrow_reader->NumberOfStripes();
+ }
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(reader);
+ if (priv->field_indexes) {
+ std::vector<int> arrow_field_indexes;
+ auto field_indexes = priv->field_indexes;
+ for (guint j = 0; j < field_indexes->len; ++j) {
+ arrow_field_indexes.push_back(g_array_index(field_indexes, gint, j));
+ }
+ std::shared_ptr<arrow::RecordBatch> arrow_record_batch;
+ auto status = arrow_reader->ReadStripe(i,
+ arrow_field_indexes,
+ &arrow_record_batch);
+ if (garrow_error_check(error, status, "[orc-file-reader][read-stripe]")) {
+ return garrow_record_batch_new_raw(&arrow_record_batch);
+ } else {
+ return NULL;
+ }
+ } else {
+ std::shared_ptr<arrow::RecordBatch> arrow_record_batch;
+ auto status = arrow_reader->ReadStripe(i, &arrow_record_batch);
+ if (garrow_error_check(error, status, "[orc-file-reader][read-stripe]")) {
+ return garrow_record_batch_new_raw(&arrow_record_batch);
+ } else {
+ return NULL;
+ }
+ }
+}
+
+/**
+ * garrow_orc_file_reader_get_n_stripes:
+ * @reader: A #GArrowORCFileReader.
+ *
+ * Returns: The number of stripes in the file.
+ *
+ * Since: 0.10.0
+ */
+gint64
+garrow_orc_file_reader_get_n_stripes(GArrowORCFileReader *reader)
+{
+ auto arrow_reader = garrow_orc_file_reader_get_raw(reader);
+ return arrow_reader->NumberOfStripes();
+}
+
+/**
+ * garrow_orc_file_reader_get_n_rows:
+ * @reader: A #GArrowORCFileReader.
+ *
+ * Returns: The number of rows in the file.
+ *
+ * Since: 0.10.0
+ */
+gint64
+garrow_orc_file_reader_get_n_rows(GArrowORCFileReader *reader)
+{
+ auto arrow_reader = garrow_orc_file_reader_get_raw(reader);
+ return arrow_reader->NumberOfRows();
+}
+
+
+G_END_DECLS
+
+
+GArrowORCFileReader *
+garrow_orc_file_reader_new_raw(GArrowSeekableInputStream *input,
+ arrow::adapters::orc::ORCFileReader *arrow_reader)
+{
+ auto reader =
+ GARROW_ORC_FILE_READER(g_object_new(GARROW_TYPE_ORC_FILE_READER,
+ "input", input,
+ "orc-file-reader", arrow_reader,
+ NULL));
+ return reader;
+}
+
+arrow::adapters::orc::ORCFileReader *
+garrow_orc_file_reader_get_raw(GArrowORCFileReader *reader)
+{
+ auto priv = GARROW_ORC_FILE_READER_GET_PRIVATE(reader);
+ return priv->orc_file_reader;
+}
diff --git a/c_glib/arrow-glib/orc-file-reader.h b/c_glib/arrow-glib/orc-file-reader.h
new file mode 100644
index 0000000..67fd8b0
--- /dev/null
+++ b/c_glib/arrow-glib/orc-file-reader.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow-glib/reader.h>
+
+G_BEGIN_DECLS
+
+#define GARROW_TYPE_ORC_FILE_READER (garrow_orc_file_reader_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowORCFileReader,
+ garrow_orc_file_reader,
+ GARROW,
+ ORC_FILE_READER,
+ GObject)
+struct _GArrowORCFileReaderClass
+{
+ GObjectClass parent_class;
+};
+
+GArrowORCFileReader *
+garrow_orc_file_reader_new(GArrowSeekableInputStream *file,
+ GError **error);
+void
+garrow_orc_file_reader_set_field_indexes(GArrowORCFileReader *reader,
+ const gint *field_indexes,
+ guint n_field_indexes);
+const gint *
+garrow_orc_file_reader_get_field_indexes(GArrowORCFileReader *reader,
+ guint *n_field_indexes);
+GArrowSchema *
+garrow_orc_file_reader_read_type(GArrowORCFileReader *reader,
+ GError **error);
+GArrowTable *
+garrow_orc_file_reader_read_stripes(GArrowORCFileReader *reader,
+ GError **error);
+GArrowRecordBatch *
+garrow_orc_file_reader_read_stripe(GArrowORCFileReader *reader,
+ gint64 i,
+ GError **error);
+gint64 garrow_orc_file_reader_get_n_stripes(GArrowORCFileReader *reader);
+gint64 garrow_orc_file_reader_get_n_rows(GArrowORCFileReader *reader);
+
+G_END_DECLS
diff --git a/c_glib/arrow-glib/orc-file-reader.hpp b/c_glib/arrow-glib/orc-file-reader.hpp
new file mode 100644
index 0000000..4171290
--- /dev/null
+++ b/c_glib/arrow-glib/orc-file-reader.hpp
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow/adapters/orc/adapter.h>
+
+#include <arrow-glib/reader.hpp>
+#include <arrow-glib/orc-file-reader.h>
+
+GArrowORCFileReader *
+garrow_orc_file_reader_new_raw(GArrowSeekableInputStream *input,
+ arrow::adapters::orc::ORCFileReader *arrow_reader);
+arrow::adapters::orc::ORCFileReader *
+garrow_orc_file_reader_get_raw(GArrowORCFileReader *reader);
diff --git a/c_glib/configure.ac b/c_glib/configure.ac
index d80cc9e..d6b7820 100644
--- a/c_glib/configure.ac
+++ b/c_glib/configure.ac
@@ -115,6 +115,10 @@ if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then
USE_ARROW_BUILD_DIR=no
PKG_CHECK_MODULES([ARROW], [arrow arrow-compute])
+ PKG_CHECK_MODULES([ARROW_ORC],
+ [arrow-orc],
+ [HAVE_ARROW_ORC=yes],
+ [HAVE_ARROW_ORC=no])
PKG_CHECK_MODULES([ARROW_GPU],
[arrow-gpu],
[HAVE_ARROW_GPU=yes],
@@ -135,6 +139,12 @@ else
AC_SUBST(ARROW_CFLAGS)
AC_SUBST(ARROW_LIBS)
+ if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/adapters/orc/arrow-orc.pc"; then
+ HAVE_ARROW_ORC=yes
+ else
+ HAVE_ARROW_ORC=no
+ fi
+
ARROW_GPU_CFLAGS=""
if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-gpu.pc"; then
HAVE_ARROW_GPU=yes
@@ -150,6 +160,11 @@ fi
AM_CONDITIONAL([USE_ARROW_BUILD_DIR],
[test "$USE_ARROW_BUILD_DIR" = "yes"])
+AM_CONDITIONAL([HAVE_ARROW_ORC], [test "$HAVE_ARROW_ORC" = "yes"])
+if test "$HAVE_ARROW_ORC" = "yes"; then
+ AC_DEFINE(HAVE_ARROW_ORC, [1], [Define to 1 if Apache Arrow supports ORC.])
+fi
+
AM_CONDITIONAL([HAVE_ARROW_GPU], [test "$HAVE_ARROW_GPU" = "yes"])
if test "$HAVE_ARROW_GPU" = "yes"; then
AC_DEFINE(HAVE_ARROW_GPU, [1], [Define to 1 if Apache Arrow supports GPU.])
@@ -162,6 +177,7 @@ AC_CONFIG_FILES([
Makefile
arrow-glib/Makefile
arrow-glib/arrow-glib.pc
+ arrow-glib/arrow-glib-orc.pc
arrow-glib/version.h
arrow-gpu-glib/Makefile
arrow-gpu-glib/arrow-gpu-glib.pc
diff --git a/c_glib/doc/reference/Makefile.am b/c_glib/doc/reference/Makefile.am
index 6f916a6..ad0c938 100644
--- a/c_glib/doc/reference/Makefile.am
+++ b/c_glib/doc/reference/Makefile.am
@@ -36,6 +36,11 @@ HFILE_GLOB = \
IGNORE_HFILES =
+if !HAVE_ARROW_ORC
+IGNORE_HFILES += \
+ $(top_srcdir)/arrow-glib/orc-file-reader.h
+endif
+
CFILE_GLOB = \
$(top_srcdir)/arrow-glib/*.cpp
diff --git a/c_glib/doc/reference/arrow-glib-docs.xml b/c_glib/doc/reference/arrow-glib-docs.xml
index 2c75041..776a7b7 100644
--- a/c_glib/doc/reference/arrow-glib-docs.xml
+++ b/c_glib/doc/reference/arrow-glib-docs.xml
@@ -123,6 +123,7 @@
<chapter id="reader">
<title>Reader</title>
<xi:include href="xml/reader.xml"/>
+ <xi:include href="xml/orc-file-reader.xml"><xi:fallback /></xi:include>
</chapter>
<chapter id="writer">
<title>Writer</title>
diff --git a/c_glib/doc/reference/meson.build b/c_glib/doc/reference/meson.build
index 431aa0a..4f72424 100644
--- a/c_glib/doc/reference/meson.build
+++ b/c_glib/doc/reference/meson.build
@@ -59,10 +59,17 @@ if arrow_gpu_dependency.found()
libarrow_gpu_glib_dependency,
]
endif
+ignore_headers = []
+if not arrow_orc_dependency.found()
+ ignore_headers += [
+ join_paths(meson.source_root(), 'arrow-glib', 'orc-file-reader.h'),
+ ]
+endif
gnome.gtkdoc(meson.project_name(),
main_xml: meson.project_name() + '-docs.xml',
src_dir: source_directories,
dependencies: dependencies,
+ ignore_headers: ignore_headers,
gobject_typesfile: meson.project_name() + '.types',
scan_args: [
'--rebuild-types',
diff --git a/c_glib/meson.build b/c_glib/meson.build
index 330f2bb..3f7846d 100644
--- a/c_glib/meson.build
+++ b/c_glib/meson.build
@@ -48,6 +48,8 @@ pkgconfig = import('pkgconfig')
root_inc = include_directories('.')
+arrow_orc_dependency = dependency('arrow-orc', required: false)
+
subdir('arrow-glib')
arrow_gpu_dependency = dependency('arrow-gpu', required: false)
if arrow_gpu_dependency.found()
diff --git a/c_glib/test/fixture/TestOrcFile.test1.orc b/c_glib/test/fixture/TestOrcFile.test1.orc
new file mode 100644
index 0000000..4fb0bef
Binary files /dev/null and b/c_glib/test/fixture/TestOrcFile.test1.orc differ
diff --git a/c_glib/test/run-test.rb b/c_glib/test/helper/fixture.rb
old mode 100755
new mode 100644
similarity index 57%
copy from c_glib/test/run-test.rb
copy to c_glib/test/helper/fixture.rb
index 392c56f..f07afd0
--- a/c_glib/test/run-test.rb
+++ b/c_glib/test/helper/fixture.rb
@@ -1,5 +1,3 @@
-#!/usr/bin/env ruby
-#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -17,34 +15,10 @@
# specific language governing permissions and limitations
# under the License.
-require "pathname"
-require "test-unit"
-
-base_dir = Pathname(__dir__).parent
-test_dir = base_dir + "test"
-
-require "gi"
-
-Gio = GI.load("Gio")
-Arrow = GI.load("Arrow")
-module Arrow
- class Buffer
- alias_method :initialize_raw, :initialize
- def initialize(data)
- initialize_raw(data)
- @data = data
+module Helper
+ module Fixture
+ def fixture_path(*components)
+ File.join(__dir__, "..", "fixture", *components)
end
end
end
-
-begin
- ArrowGPU = GI.load("ArrowGPU")
-rescue GObjectIntrospection::RepositoryError::TypelibNotFound
-end
-
-require "rbconfig"
-require "tempfile"
-require_relative "helper/buildable"
-require_relative "helper/omittable"
-
-exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb
index 392c56f..366b083 100755
--- a/c_glib/test/run-test.rb
+++ b/c_glib/test/run-test.rb
@@ -45,6 +45,7 @@ end
require "rbconfig"
require "tempfile"
require_relative "helper/buildable"
+require_relative "helper/fixture"
require_relative "helper/omittable"
exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
diff --git a/c_glib/test/test-orc-file-reader.rb b/c_glib/test/test-orc-file-reader.rb
new file mode 100644
index 0000000..6b5c640
--- /dev/null
+++ b/c_glib/test/test-orc-file-reader.rb
@@ -0,0 +1,244 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestORCFileReader < Test::Unit::TestCase
+ include Helper::Omittable
+ include Helper::Fixture
+
+ def setup
+ omit("Require Apache Arrow ORC") unless Arrow.const_defined?(:ORCFileReader)
+ path = fixture_path("TestOrcFile.test1.orc")
+ input = Arrow::MemoryMappedInputStream.new(path)
+ @reader = Arrow::ORCFileReader.new(input)
+ end
+
+ def test_read_type
+ assert_equal(<<-SCHEMA.chomp, @reader.read_type.to_s)
+boolean1: bool
+byte1: int8
+short1: int16
+int1: int32
+long1: int64
+float1: float
+double1: double
+bytes1: binary
+string1: string
+middle: struct<list: list<item: struct<int1: int32, string1: string>>>
+list: list<item: struct<int1: int32, string1: string>>
+map: list<item: struct<key: string, value: struct<int1: int32, string1: string>>>
+ SCHEMA
+ end
+
+ def test_field_indexes
+ require_gi(1, 42, 0)
+ assert_nil(@reader.field_indexes)
+ @reader.set_field_indexes([1, 3])
+ assert_equal([1, 3], @reader.field_indexes)
+ end
+
+ sub_test_case("#read_stripes") do
+ test("all") do
+ table = @reader.read_stripes
+ dump = table.n_columns.times.collect do |i|
+ column = table.get_column(i)
+ [
+ column.field.to_s,
+ column.data.chunks.collect(&:to_s),
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", ["[false, true]"]],
+ ["byte1: int8", ["[1, 100]"]],
+ ["short1: int16", ["[1024, 2048]"]],
+ ["int1: int32", ["[65536, 65536]"]],
+ [
+ "long1: int64",
+ ["[9223372036854775807, 9223372036854775807]"],
+ ],
+ ["float1: float", ["[1, 2]"]],
+ ["double1: double", ["[-15, -5]"]],
+ ["bytes1: binary", ["[0001020304, ]"]],
+ ["string1: string", ["[\"hi\", \"bye\"]"]],
+ [
+ "middle: " +
+ "struct<list: " +
+ "list<item: struct<int1: int32, string1: string>>>",
+ [
+ <<-STRUCT.chomp
+
+-- is_valid: all not null
+-- child 0 type: list<item: struct<int1: int32, string1: string>> values:
+ -- is_valid: all not null
+ -- value_offsets: [0, 2, 4]
+ -- values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [1, 2, 1, 2]
+ -- child 1 type: string values: ["bye", "sigh", "bye", "sigh"]
+ STRUCT
+ ]
+ ],
+ [
+ "list: list<item: struct<int1: int32, string1: string>>",
+ [
+ <<-LIST.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 2, 5]
+-- values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [3, 4, 100000000, -100000, 1234]
+ -- child 1 type: string values: ["good", "bad", "cat", "in", "hat"]
+ LIST
+ ]
+ ],
+ [
+ "map: list<item: " +
+ "struct<key: string, value: " +
+ "struct<int1: int32, string1: string>>>",
+ [
+ <<-MAP.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 0, 2]
+-- values:
+ -- is_valid: all not null
+ -- child 0 type: string values: ["chani", "mauddib"]
+ -- child 1 type: struct<int1: int32, string1: string> values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [5, 1]
+ -- child 1 type: string values: ["chani", "mauddib"]
+ MAP
+ ],
+ ],
+ ],
+ dump)
+ end
+
+ test("select fields") do
+ @reader.set_field_indexes([1, 3])
+ table = @reader.read_stripes
+ dump = table.n_columns.times.collect do |i|
+ column = table.get_column(i)
+ [
+ column.field.to_s,
+ column.data.chunks.collect(&:to_s),
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", ["[false, true]"]],
+ ["short1: int16", ["[1024, 2048]"]],
+ ],
+ dump)
+ end
+ end
+
+ sub_test_case("#read_stripe") do
+ test("all") do
+ record_batch = @reader.read_stripe(0)
+ dump = record_batch.n_columns.times.collect do |i|
+ [
+ record_batch.schema.get_field(i).to_s,
+ record_batch.get_column(i).to_s,
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", "[false, true]"],
+ ["byte1: int8", "[1, 100]"],
+ ["short1: int16", "[1024, 2048]"],
+ ["int1: int32", "[65536, 65536]"],
+ [
+ "long1: int64",
+ "[9223372036854775807, 9223372036854775807]",
+ ],
+ ["float1: float", "[1, 2]"],
+ ["double1: double", "[-15, -5]"],
+ ["bytes1: binary", "[0001020304, ]"],
+ ["string1: string", "[\"hi\", \"bye\"]"],
+ [
+ "middle: " +
+ "struct<list: " +
+ "list<item: struct<int1: int32, string1: string>>>",
+ <<-STRUCT.chomp
+
+-- is_valid: all not null
+-- child 0 type: list<item: struct<int1: int32, string1: string>> values:
+ -- is_valid: all not null
+ -- value_offsets: [0, 2, 4]
+ -- values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [1, 2, 1, 2]
+ -- child 1 type: string values: ["bye", "sigh", "bye", "sigh"]
+ STRUCT
+ ],
+ [
+ "list: list<item: struct<int1: int32, string1: string>>",
+ <<-LIST.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 2, 5]
+-- values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [3, 4, 100000000, -100000, 1234]
+ -- child 1 type: string values: ["good", "bad", "cat", "in", "hat"]
+ LIST
+ ],
+ [
+ "map: list<item: " +
+ "struct<key: string, value: " +
+ "struct<int1: int32, string1: string>>>",
+ <<-MAP.chomp
+
+-- is_valid: all not null
+-- value_offsets: [0, 0, 2]
+-- values:
+ -- is_valid: all not null
+ -- child 0 type: string values: ["chani", "mauddib"]
+ -- child 1 type: struct<int1: int32, string1: string> values:
+ -- is_valid: all not null
+ -- child 0 type: int32 values: [5, 1]
+ -- child 1 type: string values: ["chani", "mauddib"]
+ MAP
+ ],
+ ],
+ dump)
+ end
+
+ test("select fields") do
+ @reader.set_field_indexes([1, 3])
+ record_batch = @reader.read_stripe(0)
+ dump = record_batch.n_columns.times.collect do |i|
+ [
+ record_batch.schema.get_field(i).to_s,
+ record_batch.get_column(i).to_s,
+ ]
+ end
+ assert_equal([
+ ["boolean1: bool", "[false, true]"],
+ ["short1: int16", "[1024, 2048]"],
+ ],
+ dump)
+ end
+ end
+
+ def test_n_stripes
+ assert_equal(1, @reader.n_stripes)
+ end
+
+ def test_n_rows
+ assert_equal(2, @reader.n_rows)
+ end
+end
diff --git a/dev/tasks/linux-packages/debian/libarrow-glib-dev.install b/dev/tasks/linux-packages/debian/libarrow-glib-dev.install
index e59a1f9..461fbd4 100644
--- a/dev/tasks/linux-packages/debian/libarrow-glib-dev.install
+++ b/dev/tasks/linux-packages/debian/libarrow-glib-dev.install
@@ -2,5 +2,6 @@ usr/include/arrow-glib/
usr/lib/*/libarrow-glib.a
usr/lib/*/libarrow-glib.so
usr/lib/*/pkgconfig/arrow-glib.pc
+usr/lib/*/pkgconfig/arrow-glib-orc.pc
usr/share/gir-1.0/Arrow-1.0.gir
usr/share/arrow-glib/example/