You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/01/10 08:39:27 UTC

[arrow] branch master updated: ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f67a515  ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects
f67a515 is described below

commit f67a5150df7d11a0ad5bc53044c192b023ad312c
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Thu Jan 10 17:39:11 2019 +0900

    ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects
    
    Author: Kouhei Sutou <ko...@clear-code.com>
    
    Closes #3360 from kou/ruby-record-batch-builder-append-records and squashes the following commits:
    
    e85bbaf5 <Kouhei Sutou>  Add support for building RecordBatch from raw Ruby objects
---
 ruby/red-arrow/lib/arrow/array-builder.rb        |   8 +-
 ruby/red-arrow/lib/arrow/list-array-builder.rb   |  10 ++
 ruby/red-arrow/lib/arrow/loader.rb               |   8 ++
 ruby/red-arrow/lib/arrow/record-batch-builder.rb | 115 ++++++++++++++++++++++
 ruby/red-arrow/lib/arrow/record-batch.rb         |  16 ++++
 ruby/red-arrow/lib/arrow/struct-array-builder.rb |  10 ++
 ruby/red-arrow/test/test-list-array-builder.rb   |  17 ++++
 ruby/red-arrow/test/test-record-batch-builder.rb | 116 +++++++++++++++++++++++
 ruby/red-arrow/test/test-record-batch.rb         | 114 +++++++++++++++-------
 ruby/red-arrow/test/test-struct-array-builder.rb |  20 ++++
 10 files changed, 400 insertions(+), 34 deletions(-)

diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb
index 8edb3c4..7cfc432 100644
--- a/ruby/red-arrow/lib/arrow/array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/array-builder.rb
@@ -65,6 +65,12 @@ module Arrow
     end
 
     def build(values)
+      append(*values)
+      finish
+    end
+
+    # @since 0.12.0
+    def append(*values)
       value_convertable = respond_to?(:convert_to_arrow_value, true)
       start_index = 0
       current_index = 0
@@ -111,8 +117,6 @@ module Arrow
           append_nulls(current_index - start_index)
         end
       end
-
-      finish
     end
 
     def append_nulls(n)
diff --git a/ruby/red-arrow/lib/arrow/list-array-builder.rb b/ruby/red-arrow/lib/arrow/list-array-builder.rb
index aa093c2..1fa507f 100644
--- a/ruby/red-arrow/lib/arrow/list-array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/list-array-builder.rb
@@ -82,5 +82,15 @@ module Arrow
         end
       end
     end
+
+    # @since 0.12.0
+    def append(*values)
+      if values.empty?
+        # For backward compatibility
+        append_value
+      else
+        super
+      end
+    end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb
index acd2573..6e0bf29 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -54,6 +54,7 @@ module Arrow
       require "arrow/path-extension"
       require "arrow/record"
       require "arrow/record-batch"
+      require "arrow/record-batch-builder"
       require "arrow/record-batch-file-reader"
       require "arrow/record-batch-stream-reader"
       require "arrow/rolling-window"
@@ -89,6 +90,13 @@ module Arrow
 
     def load_method_info(info, klass, method_name)
       case klass.name
+      when /Builder\z/
+        case method_name
+        when "append"
+          return
+        else
+          super
+        end
       when "Arrow::StringArray"
         case method_name
         when "get_value"
diff --git a/ruby/red-arrow/lib/arrow/record-batch-builder.rb b/ruby/red-arrow/lib/arrow/record-batch-builder.rb
new file mode 100644
index 0000000..dba16b3
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/record-batch-builder.rb
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  class RecordBatchBuilder
+    class << self
+      # @since 0.12.0
+      def build(schema, data)
+        builder = new(schema)
+        builder.append(data)
+        builder.flush
+      end
+    end
+
+    alias_method :initialize_raw, :initialize
+    private :initialize_raw
+    def initialize(schema)
+      unless schema.is_a?(Schema)
+        schema = Schema.new(schema)
+      end
+      initialize_raw(schema)
+      @name_to_index = {}
+      schema.fields.each_with_index do |field, i|
+        @name_to_index[field.name] = i
+      end
+    end
+
+    # @since 0.12.0
+    def [](name_or_index)
+      case name_or_index
+      when String, Symbol
+        name = name_or_index
+        self[resolve_name(name)]
+      else
+        index = name_or_index
+        column_builders[index]
+      end
+    end
+
+    # @since 0.12.0
+    def append(*values)
+      values.each do |value|
+        case value
+        when Hash
+          append_columns(value)
+        else
+          append_records(value)
+        end
+      end
+    end
+
+    # @since 0.12.0
+    def append_records(records)
+      n = n_fields
+      columns = n.times.collect do
+        []
+      end
+      records.each_with_index do |record, nth_record|
+        case record
+        when nil
+        when Hash
+          record.each do |name, value|
+            nth_column = resolve_name(name)
+            next if nth_column.nil?
+            columns[nth_column] << value
+          end
+        else
+          record.each_with_index do |value, nth_column|
+            columns[nth_column] << value
+          end
+        end
+        columns.each do |column|
+          column << nil if column.size != (nth_record + 1)
+        end
+      end
+      columns.each_with_index do |column, i|
+        self[i].append(*column)
+      end
+    end
+
+    # @since 0.12.0
+    def append_columns(columns)
+      columns.each do |name, values|
+        self[name].append(*values)
+      end
+    end
+
+    private
+    def resolve_name(name)
+      @name_to_index[name.to_s]
+    end
+
+    # TODO: Make public with good name. Is column_builders good enough?
+    # builders? sub_builders?
+    def column_builders
+      @column_builders ||= n_fields.times.collect do |i|
+        get_field(i)
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb
index 6d9c35b..b577d4a 100644
--- a/ruby/red-arrow/lib/arrow/record-batch.rb
+++ b/ruby/red-arrow/lib/arrow/record-batch.rb
@@ -22,6 +22,22 @@ module Arrow
     include RecordContainable
     include Enumerable
 
+    class << self
+      def new(*args)
+        n_args = args.size
+        case n_args
+        when 2
+          schema, data = args
+          RecordBatchBuilder.build(schema, data)
+        when 3
+          super
+        else
+          message = "wrong number of arguments (given #{n_args}, expected 2..3)"
+          raise ArgumentError, message
+        end
+      end
+    end
+
     alias_method :each, :each_record
 
     alias_method :columns_raw, :columns
diff --git a/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/ruby/red-arrow/lib/arrow/struct-array-builder.rb
index 52f75aa..b56056c 100644
--- a/ruby/red-arrow/lib/arrow/struct-array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/struct-array-builder.rb
@@ -119,6 +119,16 @@ module Arrow
       end
     end
 
+    # @since 0.12.0
+    def append(*values)
+      if values.empty?
+        # For backward compatibility
+        append_value_raw
+      else
+        super
+      end
+    end
+
     private
     def cached_field_builders
       @field_builders ||= field_builders
diff --git a/ruby/red-arrow/test/test-list-array-builder.rb b/ruby/red-arrow/test/test-list-array-builder.rb
index e36f2c8..aee31e7 100644
--- a/ruby/red-arrow/test/test-list-array-builder.rb
+++ b/ruby/red-arrow/test/test-list-array-builder.rb
@@ -59,4 +59,21 @@ class ListArrayBuilderTest < Test::Unit::TestCase
                    array.collect {|list| list ? list.to_a : nil})
     end
   end
+
+  sub_test_case("#append") do
+    test("backward compatibility") do
+      @builder.append
+      @builder.value_builder.append(true)
+      @builder.value_builder.append(false)
+      @builder.append
+      @builder.value_builder.append(true)
+      array = @builder.finish
+
+      assert_equal([
+                     [true, false],
+                     [true],
+                   ],
+                   array.collect(&:to_a))
+    end
+  end
 end
diff --git a/ruby/red-arrow/test/test-record-batch-builder.rb b/ruby/red-arrow/test/test-record-batch-builder.rb
new file mode 100644
index 0000000..7cd1f8c
--- /dev/null
+++ b/ruby/red-arrow/test/test-record-batch-builder.rb
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class RecordBatchBuilderTest < Test::Unit::TestCase
+  sub_test_case(".new") do
+    test("Schema") do
+      schema = Arrow::Schema.new(visible: :boolean,
+                                 count: :uint32)
+      builder = Arrow::RecordBatchBuilder.new(schema)
+      assert_equal(schema,
+                   builder.schema)
+    end
+
+    test("Hash") do
+      builder = Arrow::RecordBatchBuilder.new(visible: :boolean,
+                                              count: :uint32)
+      assert_equal(Arrow::Schema.new(visible: :boolean,
+                                     count: :uint32),
+                   builder.schema)
+    end
+  end
+
+  sub_test_case("instance methods") do
+    def setup
+      @schema = Arrow::Schema.new(visible: :boolean,
+                                  count: :uint32)
+      @builder = Arrow::RecordBatchBuilder.new(@schema)
+    end
+
+    sub_test_case("#[]") do
+      test("String") do
+        assert_equal(Arrow::BooleanDataType.new,
+                     @builder["visible"].value_data_type)
+      end
+
+      test("Symbol") do
+        assert_equal(Arrow::BooleanDataType.new,
+                     @builder[:visible].value_data_type)
+      end
+
+      test("Integer") do
+        assert_equal(Arrow::UInt32DataType.new,
+                     @builder[1].value_data_type)
+      end
+    end
+
+    test("#append") do
+      records = [
+        {visible: true, count: 1},
+      ]
+      columns = {
+        visible: [false],
+        count: [2],
+      }
+      arrays = [
+        Arrow::BooleanArray.new([true, false]),
+        Arrow::UInt32Array.new([1, 2]),
+      ]
+      @builder.append(records, columns)
+      assert_equal(Arrow::RecordBatch.new(@schema,
+                                          arrays[0].length,
+                                          arrays),
+                   @builder.flush)
+    end
+
+    test("#append_records") do
+      records = [
+        {visible: true, count: 1},
+        {visible: true, count: 2, garbage: "garbage"},
+        {visible: true},
+        [false, 4],
+        nil,
+        [true],
+      ]
+      arrays = [
+        Arrow::BooleanArray.new([true, true, true, false, nil, true]),
+        Arrow::UInt32Array.new([1, 2, nil, 4, nil, nil]),
+      ]
+      @builder.append_records(records)
+      assert_equal(Arrow::RecordBatch.new(@schema,
+                                          arrays[0].length,
+                                          arrays),
+                   @builder.flush)
+    end
+
+    test("#append_columns") do
+      columns = {
+        visible: [true, true, true, false, nil, true],
+        count: [1, 2, nil, 4, nil, nil],
+      }
+      arrays = [
+        Arrow::BooleanArray.new(columns[:visible]),
+        Arrow::UInt32Array.new(columns[:count]),
+      ]
+      @builder.append_columns(columns)
+      assert_equal(Arrow::RecordBatch.new(@schema,
+                                          arrays[0].length,
+                                          arrays),
+                   @builder.flush)
+    end
+  end
+end
diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb
index 4dac085..d33298b 100644
--- a/ruby/red-arrow/test/test-record-batch.rb
+++ b/ruby/red-arrow/test/test-record-batch.rb
@@ -16,47 +16,97 @@
 # under the License.
 
 class RecordBatchTest < Test::Unit::TestCase
-  setup do
-    fields = [
-      Arrow::Field.new("count", :uint32),
-    ]
-    @schema = Arrow::Schema.new(fields)
-    @counts = Arrow::UInt32Array.new([1, 2, 4, 8])
-    @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts])
-  end
+  sub_test_case(".new") do
+    def setup
+      @schema = Arrow::Schema.new(visible: :boolean,
+                                  count: :uint32)
+    end
 
-  sub_test_case(".each") do
-    test("default") do
-      records = []
-      @record_batch.each do |record|
-        records << [record, record.index]
-      end
+    test("[Schema, records]") do
+      records = [
+        {visible: true, count: 1},
+        nil,
+        [false, 3],
+      ]
+      record_batch = Arrow::RecordBatch.new(@schema, records)
       assert_equal([
-                     [0, 0],
-                     [1, 1],
-                     [2, 2],
-                     [3, 3],
+                     {"visible" => true,  "count" => 1},
+                     {"visible" => nil,   "count" => nil},
+                     {"visible" => false, "count" => 3},
                    ],
-                   records.collect {|record, i| [record.index, i]})
+                   record_batch.each_record.collect(&:to_h))
     end
 
-    test("reuse_record: true") do
-      records = []
-      @record_batch.each(reuse_record: true) do |record|
-        records << [record, record.index]
-      end
+    test("[Schema, columns]") do
+      columns = {
+        visible: [true, nil, false],
+        count: [1, 2, nil],
+      }
+      record_batch = Arrow::RecordBatch.new(@schema, columns)
+      assert_equal([
+                     {"visible" => true,  "count" => 1},
+                     {"visible" => nil,   "count" => 2},
+                     {"visible" => false, "count" => nil},
+                   ],
+                   record_batch.each_record.collect(&:to_h))
+    end
+
+    test("[Schema, n_rows, columns]") do
+      columns = [
+        Arrow::BooleanArray.new([true, nil, false]),
+        Arrow::UInt32Array.new([1, 2, nil]),
+      ]
+      n_rows = columns[0].length
+      record_batch = Arrow::RecordBatch.new(@schema, n_rows, columns)
       assert_equal([
-                     [3, 0],
-                     [3, 1],
-                     [3, 2],
-                     [3, 3],
+                     {"visible" => true,  "count" => 1},
+                     {"visible" => nil,   "count" => 2},
+                     {"visible" => false, "count" => nil},
                    ],
-                   records.collect {|record, i| [record.index, i]})
+                   record_batch.each_record.collect(&:to_h))
     end
   end
 
-  test("#to_table") do
-    assert_equal(Arrow::Table.new(@schema, [@counts]),
-                 @record_batch.to_table)
+  sub_test_case("instance methods") do
+    def setup
+      @schema = Arrow::Schema.new(count: :uint32)
+      @counts = Arrow::UInt32Array.new([1, 2, 4, 8])
+      @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts])
+    end
+
+    sub_test_case("#each") do
+      test("default") do
+        records = []
+        @record_batch.each do |record|
+          records << [record, record.index]
+        end
+        assert_equal([
+                       [0, 0],
+                       [1, 1],
+                       [2, 2],
+                       [3, 3],
+                     ],
+                     records.collect {|record, i| [record.index, i]})
+      end
+
+      test("reuse_record: true") do
+        records = []
+        @record_batch.each(reuse_record: true) do |record|
+          records << [record, record.index]
+        end
+        assert_equal([
+                       [3, 0],
+                       [3, 1],
+                       [3, 2],
+                       [3, 3],
+                     ],
+                     records.collect {|record, i| [record.index, i]})
+      end
+    end
+
+    test("#to_table") do
+      assert_equal(Arrow::Table.new(@schema, [@counts]),
+                   @record_batch.to_table)
+    end
   end
 end
diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb
index 42e1ded..f7706ee 100644
--- a/ruby/red-arrow/test/test-struct-array-builder.rb
+++ b/ruby/red-arrow/test/test-struct-array-builder.rb
@@ -157,4 +157,24 @@ class StructArrayBuilderTest < Test::Unit::TestCase
                    ])
     end
   end
+
+  sub_test_case("#append") do
+    test("backward compatibility") do
+      @builder.append
+      @builder.get_field_builder(0).append(true)
+      @builder.get_field_builder(1).append(1)
+      @builder.append
+      @builder.get_field_builder(0).append(false)
+      @builder.get_field_builder(1).append(2)
+      array = @builder.finish
+      assert_equal([
+                     [true, 1],
+                     [false, 2],
+                   ],
+                   [
+                     array.get_value(0).values,
+                     array.get_value(1).values,
+                   ])
+    end
+  end
 end