You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2020/04/08 01:38:13 UTC

[arrow] branch master updated: ARROW-8343: [GLib] Add GArrowRecordBatchIterator

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0a66565  ARROW-8343: [GLib] Add GArrowRecordBatchIterator
0a66565 is described below

commit 0a665657aab9fb188b162044c876c9c2f5055815
Author: Kenta Murata <mr...@mrkn.jp>
AuthorDate: Wed Apr 8 10:37:54 2020 +0900

    ARROW-8343: [GLib] Add GArrowRecordBatchIterator
    
    I'd like to add `GArrowRecordBatchIterator` as a binding of `arrow::RecordBatchIterator` class.
    
    Closes #6847 from mrkn/ARROW-8343
    
    Lead-authored-by: Kenta Murata <mr...@mrkn.jp>
    Co-authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 c_glib/arrow-glib/record-batch.cpp        | 208 ++++++++++++++++++++++++++++--
 c_glib/arrow-glib/record-batch.h          |  32 +++++
 c_glib/arrow-glib/record-batch.hpp        |   6 +
 c_glib/test/test-record-batch-iterator.rb |  51 ++++++++
 ci/docker/ubuntu-20.04-cpp.dockerfile     |   5 +-
 5 files changed, 290 insertions(+), 12 deletions(-)

diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp
index 5d189a7..3766585 100644
--- a/c_glib/arrow-glib/record-batch.cpp
+++ b/c_glib/arrow-glib/record-batch.cpp
@@ -28,19 +28,26 @@
 #include <arrow-glib/record-batch.hpp>
 #include <arrow-glib/schema.hpp>
 
+#include <arrow/util/iterator.h>
+
 #include <sstream>
 
 G_BEGIN_DECLS
 
 /**
  * SECTION: record-batch
- * @short_description: Record batch class
+ * @section_id: record-batch
+ * @title: Record batch related classes
+ * @include: arrow-glib/arrow-glib.h
  *
  * #GArrowRecordBatch is a class for record batch. Record batch is
  * similar to #GArrowTable. Record batch also has also zero or more
  * columns and zero or more records.
  *
  * Record batch is used for shared memory IPC.
+ *
+ * #GArrowRecordBatchIterator is a class for iterating record
+ * batches.
  */
 
 typedef struct GArrowRecordBatchPrivate_ {
@@ -48,8 +55,7 @@ typedef struct GArrowRecordBatchPrivate_ {
 } GArrowRecordBatchPrivate;
 
 enum {
-  PROP_0,
-  PROP_RECORD_BATCH
+  PROP_RECORD_BATCH = 1,
 };
 
 G_DEFINE_TYPE_WITH_PRIVATE(GArrowRecordBatch,
@@ -73,9 +79,9 @@ garrow_record_batch_finalize(GObject *object)
 
 static void
 garrow_record_batch_set_property(GObject *object,
-                          guint prop_id,
-                          const GValue *value,
-                          GParamSpec *pspec)
+                                 guint prop_id,
+                                 const GValue *value,
+                                 GParamSpec *pspec)
 {
   auto priv = GARROW_RECORD_BATCH_GET_PRIVATE(object);
 
@@ -92,9 +98,9 @@ garrow_record_batch_set_property(GObject *object,
 
 static void
 garrow_record_batch_get_property(GObject *object,
-                          guint prop_id,
-                          GValue *value,
-                          GParamSpec *pspec)
+                                 guint prop_id,
+                                 GValue *value,
+                                 GParamSpec *pspec)
 {
   switch (prop_id) {
   default:
@@ -402,6 +408,174 @@ garrow_record_batch_remove_column(GArrowRecordBatch *record_batch,
   }
 }
 
+
+typedef struct GArrowRecordBatchIteratorPrivate_ {
+  arrow::RecordBatchIterator iterator;
+} GArrowRecordBatchIteratorPrivate;
+
+enum {
+  PROP_ITERATOR = 1,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowRecordBatchIterator,
+                           garrow_record_batch_iterator,
+                           G_TYPE_OBJECT)
+
+#define GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(obj)        \
+  static_cast<GArrowRecordBatchIteratorPrivate *>(           \
+     garrow_record_batch_iterator_get_instance_private(      \
+       GARROW_RECORD_BATCH_ITERATOR(obj)))
+
+static void
+garrow_record_batch_iterator_finalize(GObject *object)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(object);
+
+  priv->iterator.~Iterator();
+
+  G_OBJECT_CLASS(garrow_record_batch_iterator_parent_class)->finalize(object);
+}
+
+static void
+garrow_record_batch_iterator_set_property(GObject *object,
+                                          guint prop_id,
+                                          const GValue *value,
+                                          GParamSpec *pspec)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_ITERATOR:
+    priv->iterator =
+      std::move(*static_cast<arrow::RecordBatchIterator *>(g_value_get_pointer(value)));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_record_batch_iterator_init(GArrowRecordBatchIterator *object)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(object);
+  new(&priv->iterator) arrow::RecordBatchIterator;
+}
+
+static void
+garrow_record_batch_iterator_class_init(GArrowRecordBatchIteratorClass *klass)
+{
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize     = garrow_record_batch_iterator_finalize;
+  gobject_class->set_property = garrow_record_batch_iterator_set_property;
+
+  GParamSpec *spec;
+
+  spec = g_param_spec_pointer("iterator",
+                              "Iterator",
+                              "The raw arrow::RecordBatchIterator",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_ITERATOR, spec);
+}
+
+/**
+ * garrow_record_batch_iterator_new:
+ * @record_batches: (element-type GArrowRecordBatch):
+ *   The record batches.
+ *
+ * Returns: A newly created #GArrowRecordBatchIterator.
+ *
+ * Since: 0.17.0
+ */
+GArrowRecordBatchIterator *
+garrow_record_batch_iterator_new(GList *record_batches)
+{
+  std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches;
+  for (auto node = record_batches; node; node = node->next) {
+    auto record_batch = GARROW_RECORD_BATCH(node->data);
+    arrow_record_batches.push_back(garrow_record_batch_get_raw(record_batch));
+  }
+
+  auto arrow_iterator = arrow::MakeVectorIterator(arrow_record_batches);
+  return garrow_record_batch_iterator_new_raw(&arrow_iterator);
+}
+
+/**
+ * garrow_record_batch_iterator_next:
+ * @iterator: A #GArrowRecordBatchIterator.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable) (transfer full):
+ *   The next #GArrowRecordBatch, or %NULL when the iterator is completed.
+ *
+ * Since: 0.17.0
+ */
+GArrowRecordBatch *
+garrow_record_batch_iterator_next(GArrowRecordBatchIterator *iterator,
+                                  GError **error)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(iterator);
+
+  auto result = priv->iterator.Next();
+  if (garrow::check(error, result, "[record-batch-iterator][next]")) {
+    auto arrow_record_batch = *result;
+    if (arrow_record_batch) {
+      return garrow_record_batch_new_raw(&arrow_record_batch);
+    }
+  }
+  return NULL;
+}
+
+/**
+ * garrow_record_batch_iterator_equal:
+ * @iterator: A #GArrowRecordBatchIterator.
+ * @other_iterator: A #GArrowRecordBatchIterator to be compared.
+ *
+ * Returns: %TRUE if both iterators are the same, %FALSE otherwise.
+ *
+ * Since: 0.17.0
+ */
+gboolean
+garrow_record_batch_iterator_equal(GArrowRecordBatchIterator *iterator,
+                                   GArrowRecordBatchIterator *other_iterator)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(iterator);
+  auto priv_other = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(other_iterator);
+  return priv->iterator.Equals(priv_other->iterator);
+}
+
+/**
+ * garrow_record_batch_iterator_to_list:
+ * @iterator: A #GArrowRecordBatchIterator.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (element-type GArrowRecordBatch) (transfer full):
+ *   A #GList contains every moved elements from the iterator.
+ *
+ * Since: 0.17.0
+ */
+GList*
+garrow_record_batch_iterator_to_list(GArrowRecordBatchIterator *iterator,
+                                     GError **error)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(iterator);
+  GList *record_batches = NULL;
+  for (auto arrow_record_batch_result : priv->iterator) {
+    if (!garrow::check(error,
+                       arrow_record_batch_result,
+                       "[record-batch-iterator][to-list]")) {
+      g_list_free_full(record_batches, g_object_unref);
+      return NULL;
+    }
+    auto arrow_record_batch = *std::move(arrow_record_batch_result);
+    auto record_batch = garrow_record_batch_new_raw(&arrow_record_batch);
+    record_batches = g_list_prepend(record_batches, record_batch);
+  }
+  return g_list_reverse(record_batches);
+}
+
 G_END_DECLS
 
 GArrowRecordBatch *
@@ -420,3 +594,19 @@ garrow_record_batch_get_raw(GArrowRecordBatch *record_batch)
   auto priv = GARROW_RECORD_BATCH_GET_PRIVATE(record_batch);
   return priv->record_batch;
 }
+
+GArrowRecordBatchIterator *
+garrow_record_batch_iterator_new_raw(arrow::RecordBatchIterator *arrow_iterator)
+{
+  auto iterator = g_object_new(GARROW_TYPE_RECORD_BATCH_ITERATOR,
+                               "iterator", arrow_iterator,
+                               NULL);
+  return GARROW_RECORD_BATCH_ITERATOR(iterator);
+}
+
+arrow::RecordBatchIterator *
+garrow_record_batch_iterator_get_raw(GArrowRecordBatchIterator *iterator)
+{
+  auto priv = GARROW_RECORD_BATCH_ITERATOR_GET_PRIVATE(iterator);
+  return &priv->iterator;
+}
diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h
index e9f4807..58c1da4 100644
--- a/c_glib/arrow-glib/record-batch.h
+++ b/c_glib/arrow-glib/record-batch.h
@@ -71,4 +71,36 @@ GArrowRecordBatch *garrow_record_batch_remove_column(GArrowRecordBatch *record_b
                                                      guint i,
                                                      GError **error);
 
+
+#define GARROW_TYPE_RECORD_BATCH_ITERATOR       \
+  (garrow_record_batch_iterator_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowRecordBatchIterator,
+                         garrow_record_batch_iterator,
+                         GARROW,
+                         RECORD_BATCH_ITERATOR,
+                         GObject)
+struct _GArrowRecordBatchIteratorClass
+{
+  GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_0_17
+GArrowRecordBatchIterator *
+garrow_record_batch_iterator_new(GList *record_batches);
+
+GARROW_AVAILABLE_IN_0_17
+GArrowRecordBatch *
+garrow_record_batch_iterator_next(GArrowRecordBatchIterator *iterator,
+                                  GError **error);
+
+GARROW_AVAILABLE_IN_0_17
+gboolean
+garrow_record_batch_iterator_equal(GArrowRecordBatchIterator *iterator,
+                                   GArrowRecordBatchIterator *other_iterator);
+
+GARROW_AVAILABLE_IN_0_17
+GList*
+garrow_record_batch_iterator_to_list(GArrowRecordBatchIterator *iterator,
+                                     GError **error);
+
 G_END_DECLS
diff --git a/c_glib/arrow-glib/record-batch.hpp b/c_glib/arrow-glib/record-batch.hpp
index 2e4fe03..5068110 100644
--- a/c_glib/arrow-glib/record-batch.hpp
+++ b/c_glib/arrow-glib/record-batch.hpp
@@ -25,3 +25,9 @@
 
 GArrowRecordBatch *garrow_record_batch_new_raw(std::shared_ptr<arrow::RecordBatch> *arrow_record_batch);
 std::shared_ptr<arrow::RecordBatch> garrow_record_batch_get_raw(GArrowRecordBatch *record_batch);
+
+GArrowRecordBatchIterator *
+garrow_record_batch_iterator_new_raw(arrow::RecordBatchIterator *arrow_iterator);
+
+arrow::RecordBatchIterator *
+garrow_record_batch_iterator_get_raw(GArrowRecordBatchIterator *iterator);
diff --git a/c_glib/test/test-record-batch-iterator.rb b/c_glib/test/test-record-batch-iterator.rb
new file mode 100644
index 0000000..daedde7
--- /dev/null
+++ b/c_glib/test/test-record-batch-iterator.rb
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestRecordBatchIterator <Test::Unit::TestCase
+  include Helper::Buildable
+
+  def setup
+    fields = [
+      Arrow::Field.new("visible", Arrow::BooleanDataType.new),
+      Arrow::Field.new("point", Arrow::Int32DataType.new),
+    ]
+    schema = Arrow::Schema.new(fields)
+    @record_batches = [
+      [
+        build_boolean_array([true, false, true]),
+        build_int32_array([1, 2, 3]),
+      ],
+      [
+        build_boolean_array([false, true, false, true]),
+        build_int32_array([-1, -2, -3, -4]),
+      ]
+    ].collect do |columns|
+      Arrow::RecordBatch.new(schema, columns[0].length, columns)
+    end
+    @iterator = Arrow::RecordBatchIterator.new(@record_batches)
+  end
+
+  def test_next
+    assert_equal(@record_batches[0], @iterator.next)
+    assert_equal(@record_batches[1], @iterator.next)
+    assert_equal(nil, @iterator.next)
+  end
+
+  def test_to_list
+    assert_equal(@record_batches, @iterator.to_list)
+  end
+end
diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile
index e08c3e6..897f794 100644
--- a/ci/docker/ubuntu-20.04-cpp.dockerfile
+++ b/ci/docker/ubuntu-20.04-cpp.dockerfile
@@ -64,6 +64,7 @@ RUN apt-get update -y -q && \
         libre2-dev \
         libsnappy-dev \
         libssl-dev \
+        libthrift-dev \
         libzstd-dev \
         ninja-build \
         pkg-config \
@@ -80,7 +81,6 @@ RUN apt-get update -y -q && \
 # - flatbuffer is not packaged
 # - libgtest-dev only provide sources
 # - libprotobuf-dev only provide sources
-# - thrift is too old
 ENV ARROW_BUILD_TESTS=ON \
     ARROW_DEPENDENCY_SOURCE=SYSTEM \
     ARROW_DATASET=ON \
@@ -106,5 +106,4 @@ ENV ARROW_BUILD_TESTS=ON \
     ORC_SOURCE=BUNDLED \
     PARQUET_BUILD_EXECUTABLES=ON \
     PARQUET_BUILD_EXAMPLES=ON \
-    PATH=/usr/lib/ccache/:$PATH \
-    Thrift_SOURCE=BUNDLED
+    PATH=/usr/lib/ccache/:$PATH