You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/01/10 08:39:27 UTC
[arrow] branch master updated: ARROW-4214: [Ruby] Add support for
building RecordBatch from raw Ruby objects
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f67a515 ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects
f67a515 is described below
commit f67a5150df7d11a0ad5bc53044c192b023ad312c
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Thu Jan 10 17:39:11 2019 +0900
ARROW-4214: [Ruby] Add support for building RecordBatch from raw Ruby objects
Author: Kouhei Sutou <ko...@clear-code.com>
Closes #3360 from kou/ruby-record-batch-builder-append-records and squashes the following commits:
e85bbaf5 <Kouhei Sutou> Add support for building RecordBatch from raw Ruby objects
---
ruby/red-arrow/lib/arrow/array-builder.rb | 8 +-
ruby/red-arrow/lib/arrow/list-array-builder.rb | 10 ++
ruby/red-arrow/lib/arrow/loader.rb | 8 ++
ruby/red-arrow/lib/arrow/record-batch-builder.rb | 115 ++++++++++++++++++++++
ruby/red-arrow/lib/arrow/record-batch.rb | 16 ++++
ruby/red-arrow/lib/arrow/struct-array-builder.rb | 10 ++
ruby/red-arrow/test/test-list-array-builder.rb | 17 ++++
ruby/red-arrow/test/test-record-batch-builder.rb | 116 +++++++++++++++++++++++
ruby/red-arrow/test/test-record-batch.rb | 114 +++++++++++++++-------
ruby/red-arrow/test/test-struct-array-builder.rb | 20 ++++
10 files changed, 400 insertions(+), 34 deletions(-)
diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb
index 8edb3c4..7cfc432 100644
--- a/ruby/red-arrow/lib/arrow/array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/array-builder.rb
@@ -65,6 +65,12 @@ module Arrow
end
+ # Appends every element of `values` through #append and returns the
+ # result of #finish.
+ # NOTE(review): `values` is splatted into #append; confirm that this
+ # is acceptable for very large inputs.
def build(values)
+ append(*values)
+ finish
+ end
+
+ # @since 0.12.0
+ def append(*values)
value_convertable = respond_to?(:convert_to_arrow_value, true)
start_index = 0
current_index = 0
@@ -111,8 +117,6 @@ module Arrow
append_nulls(current_index - start_index)
end
end
-
- finish
end
def append_nulls(n)
diff --git a/ruby/red-arrow/lib/arrow/list-array-builder.rb b/ruby/red-arrow/lib/arrow/list-array-builder.rb
index aa093c2..1fa507f 100644
--- a/ruby/red-arrow/lib/arrow/list-array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/list-array-builder.rb
@@ -82,5 +82,15 @@ module Arrow
end
end
end
+
+    # Appends +values+ to this builder.
+    #
+    # A call without arguments is kept for backward compatibility and
+    # delegates to +append_value+; any other call is forwarded to the
+    # superclass implementation.
+    #
+    # @since 0.12.0
+    def append(*values)
+      # For backward compatibility
+      return append_value if values.empty?
+
+      super
+    end
end
end
diff --git a/ruby/red-arrow/lib/arrow/loader.rb b/ruby/red-arrow/lib/arrow/loader.rb
index acd2573..6e0bf29 100644
--- a/ruby/red-arrow/lib/arrow/loader.rb
+++ b/ruby/red-arrow/lib/arrow/loader.rb
@@ -54,6 +54,7 @@ module Arrow
require "arrow/path-extension"
require "arrow/record"
require "arrow/record-batch"
+ require "arrow/record-batch-builder"
require "arrow/record-batch-file-reader"
require "arrow/record-batch-stream-reader"
require "arrow/rolling-window"
@@ -89,6 +90,13 @@ module Arrow
def load_method_info(info, klass, method_name)
case klass.name
+ when /Builder\z/
+ case method_name
+ when "append"
+ return
+ else
+ super
+ end
when "Arrow::StringArray"
case method_name
when "get_value"
diff --git a/ruby/red-arrow/lib/arrow/record-batch-builder.rb b/ruby/red-arrow/lib/arrow/record-batch-builder.rb
new file mode 100644
index 0000000..dba16b3
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/record-batch-builder.rb
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  # Ruby-friendly additions to the record batch builder.
+  #
+  # Values can be supplied as records (one Hash, positional collection
+  # or nil per row) or as columns (a Hash that maps field names to the
+  # values of that column).
+  class RecordBatchBuilder
+    class << self
+      # Builds a record batch in one call.
+      #
+      # @param schema [Schema, Object] a schema, or anything accepted by
+      #   +Schema.new+.
+      # @param data [Hash, #each_with_index] columns (when a Hash) or
+      #   records (anything else); see {#append}.
+      # @return the result of +flush+ after +data+ has been appended.
+      #
+      # @since 0.12.0
+      def build(schema, data)
+        builder = new(schema)
+        builder.append(data)
+        builder.flush
+      end
+    end
+
+    alias_method :initialize_raw, :initialize
+    private :initialize_raw
+    # @param schema [Schema, Object] a schema; any other object is
+    #   converted with +Schema.new+ first.
+    def initialize(schema)
+      unless schema.is_a?(Schema)
+        schema = Schema.new(schema)
+      end
+      initialize_raw(schema)
+      # Field name => column index, used by #resolve_name.
+      @name_to_index = {}
+      schema.fields.each_with_index do |field, i|
+        @name_to_index[field.name] = i
+      end
+    end
+
+    # Returns the column builder for a field.
+    #
+    # @param name_or_index [String, Symbol, Integer] a field name, or
+    #   an index into the column builders.
+    #
+    # @since 0.12.0
+    def [](name_or_index)
+      case name_or_index
+      when String, Symbol
+        name = name_or_index
+        self[resolve_name(name)]
+      else
+        index = name_or_index
+        column_builders[index]
+      end
+    end
+
+    # Appends each argument in order. A Hash is treated as columns
+    # (see #append_columns); anything else is treated as records (see
+    # #append_records).
+    #
+    # @since 0.12.0
+    def append(*values)
+      values.each do |value|
+        case value
+        when Hash
+          append_columns(value)
+        else
+          append_records(value)
+        end
+      end
+    end
+
+    # Appends rows given as records.
+    #
+    # Each record may be:
+    #
+    # * +nil+: every column receives +nil+.
+    # * a Hash keyed by field name: keys that are not in the schema are
+    #   ignored and fields that are not mentioned receive +nil+.
+    # * any other object that responds to +each_with_index+ (such as an
+    #   Array): values are assigned by position and missing trailing
+    #   values become +nil+.
+    #
+    # @raise [ArgumentError] if a positional record has more values
+    #   than the schema has fields.
+    #
+    # @since 0.12.0
+    def append_records(records)
+      n = n_fields
+      columns = Array.new(n) {[]}
+      records.each_with_index do |record, nth_record|
+        # Build one fixed-width row per record so that every column
+        # grows by exactly one value, even when a Hash names the same
+        # field twice (for example as both String and Symbol).
+        row = Array.new(n)
+        case record
+        when nil
+        when Hash
+          record.each do |name, value|
+            nth_column = resolve_name(name)
+            next if nth_column.nil?
+            row[nth_column] = value
+          end
+        else
+          record.each_with_index do |value, nth_column|
+            if nth_column >= n
+              message = "record #{nth_record} has more than #{n} values"
+              raise ArgumentError, message
+            end
+            row[nth_column] = value
+          end
+        end
+        columns.each_with_index do |column, nth_column|
+          column << row[nth_column]
+        end
+      end
+      columns.each_with_index do |column, i|
+        # ListArrayBuilder#append and StructArrayBuilder#append treat a
+        # call without arguments as a backward-compatible value append,
+        # so an empty column must never be splatted into #append.
+        next if column.empty?
+        self[i].append(*column)
+      end
+    end
+
+    # Appends values given as columns.
+    #
+    # @param columns [Hash] field name (String or Symbol) => values to
+    #   append to that column. Columns without values are skipped.
+    #
+    # @since 0.12.0
+    def append_columns(columns)
+      columns.each do |name, values|
+        column_builder = self[name]
+        # Same expansion that the splat below performs, done up front
+        # so that an empty column can be detected and skipped (see the
+        # note in #append_records).
+        values = [*values]
+        next if values.empty?
+        column_builder.append(*values)
+      end
+    end
+
+    private
+    # Maps a field name (String or Symbol) to its column index, or
+    # +nil+ when no field has that name.
+    def resolve_name(name)
+      @name_to_index[name.to_s]
+    end
+
+    # TODO: Make public with good name. Is column_builders good enough?
+    # builders? sub_builders?
+    def column_builders
+      @column_builders ||= n_fields.times.collect do |i|
+        get_field(i)
+      end
+    end
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb
index 6d9c35b..b577d4a 100644
--- a/ruby/red-arrow/lib/arrow/record-batch.rb
+++ b/ruby/red-arrow/lib/arrow/record-batch.rb
@@ -22,6 +22,22 @@ module Arrow
include RecordContainable
include Enumerable
+    class << self
+      # Creates a record batch from two or three arguments.
+      #
+      # With two arguments, +(schema, data)+, the work is delegated to
+      # +RecordBatchBuilder.build+. With three arguments the call is
+      # passed through to the inherited constructor unchanged.
+      #
+      # @raise [ArgumentError] if neither two nor three arguments are
+      #   given.
+      def new(*args)
+        n_args = args.size
+        return super if n_args == 3
+
+        unless n_args == 2
+          message = "wrong number of arguments (given #{n_args}, expected 2..3)"
+          raise ArgumentError, message
+        end
+
+        schema, data = args
+        RecordBatchBuilder.build(schema, data)
+      end
+    end
+
alias_method :each, :each_record
alias_method :columns_raw, :columns
diff --git a/ruby/red-arrow/lib/arrow/struct-array-builder.rb b/ruby/red-arrow/lib/arrow/struct-array-builder.rb
index 52f75aa..b56056c 100644
--- a/ruby/red-arrow/lib/arrow/struct-array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/struct-array-builder.rb
@@ -119,6 +119,16 @@ module Arrow
end
end
+    # Appends +values+ to this builder.
+    #
+    # A call without arguments is kept for backward compatibility and
+    # delegates to +append_value_raw+; any other call is forwarded to
+    # the superclass implementation.
+    #
+    # @since 0.12.0
+    def append(*values)
+      # For backward compatibility
+      return append_value_raw if values.empty?
+
+      super
+    end
+
private
def cached_field_builders
@field_builders ||= field_builders
diff --git a/ruby/red-arrow/test/test-list-array-builder.rb b/ruby/red-arrow/test/test-list-array-builder.rb
index e36f2c8..aee31e7 100644
--- a/ruby/red-arrow/test/test-list-array-builder.rb
+++ b/ruby/red-arrow/test/test-list-array-builder.rb
@@ -59,4 +59,21 @@ class ListArrayBuilderTest < Test::Unit::TestCase
array.collect {|list| list ? list.to_a : nil})
end
end
+
+  sub_test_case("#append") do
+    # Argument-less #append calls interleaved with value builder
+    # appends are expected to yield one list per #append call.
+    test("backward compatibility") do
+      @builder.append
+      [true, false].each do |value|
+        @builder.value_builder.append(value)
+      end
+      @builder.append
+      @builder.value_builder.append(true)
+      lists = @builder.finish.collect {|list| list.to_a}
+
+      expected = [
+        [true, false],
+        [true],
+      ]
+      assert_equal(expected, lists)
+    end
+  end
end
diff --git a/ruby/red-arrow/test/test-record-batch-builder.rb b/ruby/red-arrow/test/test-record-batch-builder.rb
new file mode 100644
index 0000000..7cd1f8c
--- /dev/null
+++ b/ruby/red-arrow/test/test-record-batch-builder.rb
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Exercises Arrow::RecordBatchBuilder: construction, column builder
+# lookup and the record/column append entry points.
+class RecordBatchBuilderTest < Test::Unit::TestCase
+  sub_test_case(".new") do
+    test("Schema") do
+      given = Arrow::Schema.new(visible: :boolean,
+                                count: :uint32)
+      builder = Arrow::RecordBatchBuilder.new(given)
+      assert_equal(given, builder.schema)
+    end
+
+    test("Hash") do
+      builder = Arrow::RecordBatchBuilder.new(visible: :boolean,
+                                              count: :uint32)
+      expected = Arrow::Schema.new(visible: :boolean,
+                                   count: :uint32)
+      assert_equal(expected, builder.schema)
+    end
+  end
+
+  sub_test_case("instance methods") do
+    def setup
+      @schema = Arrow::Schema.new(visible: :boolean,
+                                  count: :uint32)
+      @builder = Arrow::RecordBatchBuilder.new(@schema)
+    end
+
+    sub_test_case("#[]") do
+      test("String") do
+        expected = Arrow::BooleanDataType.new
+        assert_equal(expected, @builder["visible"].value_data_type)
+      end
+
+      test("Symbol") do
+        expected = Arrow::BooleanDataType.new
+        assert_equal(expected, @builder[:visible].value_data_type)
+      end
+
+      test("Integer") do
+        expected = Arrow::UInt32DataType.new
+        assert_equal(expected, @builder[1].value_data_type)
+      end
+    end
+
+    test("#append") do
+      # One set of records followed by one set of columns: both rows
+      # are expected in the flushed batch, in that order.
+      record_rows = [
+        {visible: true, count: 1},
+      ]
+      column_values = {
+        visible: [false],
+        count: [2],
+      }
+      @builder.append(record_rows, column_values)
+
+      expected_arrays = [
+        Arrow::BooleanArray.new([true, false]),
+        Arrow::UInt32Array.new([1, 2]),
+      ]
+      expected = Arrow::RecordBatch.new(@schema,
+                                        expected_arrays[0].length,
+                                        expected_arrays)
+      assert_equal(expected, @builder.flush)
+    end
+
+    test("#append_records") do
+      # Mixes Hash records (one with an unknown key, one with a
+      # missing field), positional records (one short) and nil.
+      record_rows = [
+        {visible: true, count: 1},
+        {visible: true, count: 2, garbage: "garbage"},
+        {visible: true},
+        [false, 4],
+        nil,
+        [true],
+      ]
+      @builder.append_records(record_rows)
+
+      expected_arrays = [
+        Arrow::BooleanArray.new([true, true, true, false, nil, true]),
+        Arrow::UInt32Array.new([1, 2, nil, 4, nil, nil]),
+      ]
+      expected = Arrow::RecordBatch.new(@schema,
+                                        expected_arrays[0].length,
+                                        expected_arrays)
+      assert_equal(expected, @builder.flush)
+    end
+
+    test("#append_columns") do
+      column_values = {
+        visible: [true, true, true, false, nil, true],
+        count: [1, 2, nil, 4, nil, nil],
+      }
+      @builder.append_columns(column_values)
+
+      expected_arrays = [
+        Arrow::BooleanArray.new(column_values[:visible]),
+        Arrow::UInt32Array.new(column_values[:count]),
+      ]
+      expected = Arrow::RecordBatch.new(@schema,
+                                        expected_arrays[0].length,
+                                        expected_arrays)
+      assert_equal(expected, @builder.flush)
+    end
+  end
+end
diff --git a/ruby/red-arrow/test/test-record-batch.rb b/ruby/red-arrow/test/test-record-batch.rb
index 4dac085..d33298b 100644
--- a/ruby/red-arrow/test/test-record-batch.rb
+++ b/ruby/red-arrow/test/test-record-batch.rb
@@ -16,47 +16,97 @@
# under the License.
class RecordBatchTest < Test::Unit::TestCase
- setup do
- fields = [
- Arrow::Field.new("count", :uint32),
- ]
- @schema = Arrow::Schema.new(fields)
- @counts = Arrow::UInt32Array.new([1, 2, 4, 8])
- @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts])
- end
+ sub_test_case(".new") do
+ def setup
+ @schema = Arrow::Schema.new(visible: :boolean,
+ count: :uint32)
+ end
- sub_test_case(".each") do
- test("default") do
- records = []
- @record_batch.each do |record|
- records << [record, record.index]
- end
+ test("[Schema, records]") do
+ records = [
+ {visible: true, count: 1},
+ nil,
+ [false, 3],
+ ]
+ record_batch = Arrow::RecordBatch.new(@schema, records)
assert_equal([
- [0, 0],
- [1, 1],
- [2, 2],
- [3, 3],
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => nil},
+ {"visible" => false, "count" => 3},
],
- records.collect {|record, i| [record.index, i]})
+ record_batch.each_record.collect(&:to_h))
end
- test("reuse_record: true") do
- records = []
- @record_batch.each(reuse_record: true) do |record|
- records << [record, record.index]
- end
+ test("[Schema, columns]") do
+ columns = {
+ visible: [true, nil, false],
+ count: [1, 2, nil],
+ }
+ record_batch = Arrow::RecordBatch.new(@schema, columns)
+ assert_equal([
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => 2},
+ {"visible" => false, "count" => nil},
+ ],
+ record_batch.each_record.collect(&:to_h))
+ end
+
+ test("[Schema, n_rows, columns]") do
+ columns = [
+ Arrow::BooleanArray.new([true, nil, false]),
+ Arrow::UInt32Array.new([1, 2, nil]),
+ ]
+ n_rows = columns[0].length
+ record_batch = Arrow::RecordBatch.new(@schema, n_rows, columns)
assert_equal([
- [3, 0],
- [3, 1],
- [3, 2],
- [3, 3],
+ {"visible" => true, "count" => 1},
+ {"visible" => nil, "count" => 2},
+ {"visible" => false, "count" => nil},
],
- records.collect {|record, i| [record.index, i]})
+ record_batch.each_record.collect(&:to_h))
end
end
- test("#to_table") do
- assert_equal(Arrow::Table.new(@schema, [@counts]),
- @record_batch.to_table)
+ sub_test_case("instance methods") do
+ def setup
+ @schema = Arrow::Schema.new(count: :uint32)
+ @counts = Arrow::UInt32Array.new([1, 2, 4, 8])
+ @record_batch = Arrow::RecordBatch.new(@schema, @counts.length, [@counts])
+ end
+
+ sub_test_case("#each") do
+ test("default") do
+ records = []
+ @record_batch.each do |record|
+ records << [record, record.index]
+ end
+ assert_equal([
+ [0, 0],
+ [1, 1],
+ [2, 2],
+ [3, 3],
+ ],
+ records.collect {|record, i| [record.index, i]})
+ end
+
+ test("reuse_record: true") do
+ records = []
+ @record_batch.each(reuse_record: true) do |record|
+ records << [record, record.index]
+ end
+ assert_equal([
+ [3, 0],
+ [3, 1],
+ [3, 2],
+ [3, 3],
+ ],
+ records.collect {|record, i| [record.index, i]})
+ end
+ end
+
+ test("#to_table") do
+ assert_equal(Arrow::Table.new(@schema, [@counts]),
+ @record_batch.to_table)
+ end
end
end
diff --git a/ruby/red-arrow/test/test-struct-array-builder.rb b/ruby/red-arrow/test/test-struct-array-builder.rb
index 42e1ded..f7706ee 100644
--- a/ruby/red-arrow/test/test-struct-array-builder.rb
+++ b/ruby/red-arrow/test/test-struct-array-builder.rb
@@ -157,4 +157,24 @@ class StructArrayBuilderTest < Test::Unit::TestCase
])
end
end
+
+  sub_test_case("#append") do
+    # Each argument-less #append call is followed by one append per
+    # field builder; the finished array is expected to hold the rows.
+    test("backward compatibility") do
+      rows = [
+        [true, 1],
+        [false, 2],
+      ]
+      rows.each do |row|
+        @builder.append
+        row.each_with_index do |value, i|
+          @builder.get_field_builder(i).append(value)
+        end
+      end
+      array = @builder.finish
+      actual = 2.times.collect {|i| array.get_value(i).values}
+      assert_equal(rows, actual)
+    end
+  end
end