You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/17 07:50:06 UTC
[arrow] branch master updated: GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a8272001d GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)
1a8272001d is described below
commit 1a8272001deb5be1053bb737493c368f659bce09
Author: Hirokazu SUZUKI <he...@gmail.com>
AuthorDate: Tue Jan 17 16:49:59 2023 +0900
GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)
# Rationale for this change
The current implementation always preserves the join key columns of both tables in the result. It would be more convenient if those columns were merged.
# What changes are included in this PR?
- Columns from left and right are merged if:
- Join key is a String or a Symbol (<= incompatible Change)
- Join key is nil (natural join) (<= change in unreleased feature)
- New options `left_suffix=""` and `right_suffix=""` are introduced.
- If it is empty (by default), join key(s) do not change.
- If it is not empty, the suffix is appended to join key(s).
# Are these changes tested?
Yes.
# Are there any user-facing changes?
There is an incompatible change when the join key is a String or a Symbol.
* Closes: #15287
Lead-authored-by: Hirokazu SUZUKI <he...@gmail.com>
Co-authored-by: Sutou Kouhei <ko...@clear-code.com>
Co-authored-by: Sutou Kouhei <ko...@cozmixng.org>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
c_glib/arrow-glib/compute.cpp | 2 +-
ruby/red-arrow/lib/arrow/table.rb | 136 ++++++++++++++++++++++++++--
ruby/red-arrow/test/test-table.rb | 185 ++++++++++++++++++++++++++++++++++++--
3 files changed, 309 insertions(+), 14 deletions(-)
diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 27e49b0027..fd5a6bae04 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -5195,7 +5195,7 @@ G_END_DECLS
arrow::Result<arrow::FieldRef>
garrow_field_reference_resolve_raw(const gchar *reference)
{
- if (reference && reference[0] == '.') {
+ if (reference && (reference[0] == '.' || reference[0] == '[')) {
return arrow::FieldRef::FromDotPath(reference);
} else {
arrow::FieldRef arrow_reference(reference);
diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb
index 4f01620701..a14bf90d1f 100644
--- a/ruby/red-arrow/lib/arrow/table.rb
+++ b/ruby/red-arrow/lib/arrow/table.rb
@@ -472,18 +472,22 @@ module Arrow
#
# If both of `left_outputs` and `right_outputs` aren't
# specified, all columns in `self` and `right` are
- # outputted.
+ # output.
# @param right_outputs [::Array<String, Symbol>] Output columns in
# `right`.
#
# If both of `left_outputs` and `right_outputs` aren't
# specified, all columns in `self` and `right` are
- # outputted.
+ # output.
# @return [Arrow::Table]
# The joined `Arrow::Table`.
#
# @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
- # If key(s) are not supplied, common keys in self and right are used.
+ # If key(s) are not supplied, common keys in self and right are used
+ # (natural join).
+ #
+ # Column used as keys are merged and remain in left side
+ # when both of `left_outputs` and `right_outputs` are `nil`.
#
# @macro join_common_before
# @macro join_common_after
@@ -493,13 +497,19 @@ module Arrow
# @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
# Join right by a key.
#
+ # Column used as keys are merged and remain in left side
+ # when both of `left_outputs` and `right_outputs` are `nil`.
+ #
# @macro join_common_before
# @param key [String, Symbol] A join key.
# @macro join_common_after
#
- # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
+ # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
+ # left_outputs: nil, right_outputs: nil)
# Join right by keys.
#
+ # Column name can be renamed by appending `left_suffix` or `right_suffix`.
+ #
# @macro join_common_before
# @param keys [::Array<String, Symbol>] Join keys.
# @macro join_common_after
@@ -516,8 +526,16 @@ module Arrow
# @macro join_common_after
#
# @since 7.0.0
- def join(right, keys=nil, type: :inner, left_outputs: nil, right_outputs: nil)
+ def join(right,
+ keys=nil,
+ type: :inner,
+ left_suffix: "",
+ right_suffix: "",
+ left_outputs: nil,
+ right_outputs: nil)
+ is_natural_join = keys.nil?
keys ||= (column_names & right.column_names)
+ type = JoinType.try_convert(type) || type
plan = ExecutePlan.new
left_node = plan.build_source_node(self)
right_node = plan.build_source_node(right)
@@ -533,21 +551,43 @@ module Arrow
hash_join_node_options = HashJoinNodeOptions.new(type,
left_keys,
right_keys)
+ use_manual_outputs = false
unless left_outputs.nil?
hash_join_node_options.left_outputs = left_outputs
+ use_manual_outputs = true
end
unless right_outputs.nil?
hash_join_node_options.right_outputs = right_outputs
+ use_manual_outputs = true
end
hash_join_node = plan.build_hash_join_node(left_node,
right_node,
hash_join_node_options)
+ type_nick = type.nick
+ is_filter_join = (type_nick.end_with?("-semi") or
+ type_nick.end_with?("-anti"))
+ if use_manual_outputs or is_filter_join
+ process_node = hash_join_node
+ elsif is_natural_join
+ process_node = join_merge_keys(plan, hash_join_node, right, keys)
+ elsif keys.is_a?(String) or keys.is_a?(Symbol)
+ process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
+ elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
+ process_node = join_rename_keys(plan,
+ hash_join_node,
+ right,
+ keys,
+ left_suffix,
+ right_suffix)
+ else
+ process_node = hash_join_node
+ end
sink_node_options = SinkNodeOptions.new
- plan.build_sink_node(hash_join_node, sink_node_options)
+ plan.build_sink_node(process_node, sink_node_options)
plan.validate
plan.start
plan.wait
- reader = sink_node_options.get_reader(hash_join_node.output_schema)
+ reader = sink_node_options.get_reader(process_node.output_schema)
table = reader.read_all
share_input(table)
table
@@ -620,5 +660,87 @@ module Arrow
raise ArgumentError, message
end
end
+
+ def join_merge_keys(plan, input_node, right, keys)
+ expressions = []
+ names = []
+ normalized_keys = {}
+ keys.each do |key|
+ normalized_keys[key.to_s] = true
+ end
+ key_to_outputs = {}
+ outputs = []
+ left_n_column_names = column_names.size
+ column_names.each_with_index do |name, i|
+ is_key = normalized_keys.include?(name)
+ output = {is_key: is_key, name: name, index: i, direction: :left}
+ outputs << output
+ key_to_outputs[name] = {left: output} if is_key
+ end
+ right.column_names.each_with_index do |name, i|
+ index = left_n_column_names + i
+ is_key = normalized_keys.include?(name)
+ output = {is_key: is_key, name: name, index: index, direction: :right}
+ outputs << output
+ key_to_outputs[name][:right] = output if is_key
+ end
+
+ outputs.each do |output|
+ if output[:is_key]
+ next if output[:direction] == :right
+ left_output = key_to_outputs[output[:name]][:left]
+ right_output = key_to_outputs[output[:name]][:right]
+ left_field = FieldExpression.new("[#{left_output[:index]}]")
+ right_field = FieldExpression.new("[#{right_output[:index]}]")
+ is_left_null = CallExpression.new("is_null", [left_field])
+ merge_column = CallExpression.new("if_else",
+ [
+ is_left_null,
+ right_field,
+ left_field,
+ ])
+ expressions << merge_column
+ else
+ expressions << FieldExpression.new("[#{output[:index]}]")
+ end
+ names << output[:name]
+ end
+ project_node_options = ProjectNodeOptions.new(expressions, names)
+ plan.build_project_node(input_node, project_node_options)
+ end
+
+ def join_rename_keys(plan,
+ input_node,
+ right,
+ keys,
+ left_suffix,
+ right_suffix)
+ expressions = []
+ names = []
+ normalized_keys = {}
+ keys.each do |key|
+ normalized_keys[key.to_s] = true
+ end
+ left_n_column_names = column_names.size
+ column_names.each_with_index do |name, i|
+ expressions << FieldExpression.new("[#{i}]")
+ if normalized_keys.include?(name)
+ names << "#{name}#{left_suffix}"
+ else
+ names << name
+ end
+ end
+ right.column_names.each_with_index do |name, i|
+ index = left_n_column_names + i
+ expressions << FieldExpression.new("[#{index}]")
+ if normalized_keys.include?(name)
+ names << "#{name}#{right_suffix}"
+ else
+ names << name
+ end
+ end
+ project_node_options = ProjectNodeOptions.new(expressions, names)
+ plan.build_project_node(input_node, project_node_options)
+ end
end
end
diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb
index bd09c3536a..608091c675 100644
--- a/ruby/red-arrow/test/test-table.rb
+++ b/ruby/red-arrow/test/test-table.rb
@@ -1131,7 +1131,7 @@ visible: false
end
sub_test_case("#join") do
- test("no keys") do
+ test("keys: nil (natural join)") do
table1 = Arrow::Table.new(key: [1, 2, 3],
number: [10, 20, 30])
table2 = Arrow::Table.new(key: [3, 1],
@@ -1139,7 +1139,6 @@ visible: false
assert_equal(Arrow::Table.new([
["key", [1, 3]],
["number", [10, 30]],
- ["key", [1, 3]],
["string", ["one", "three"]],
]),
table1.join(table2))
@@ -1153,7 +1152,6 @@ visible: false
assert_equal(Arrow::Table.new([
["key", [1, 3]],
["number", [10, 30]],
- ["key", [1, 3]],
["string", ["one", "three"]],
]),
table1.join(table2, "key"))
@@ -1167,12 +1165,25 @@ visible: false
assert_equal(Arrow::Table.new([
["key", [1, 3]],
["number", [10, 30]],
- ["key", [1, 3]],
["string", ["one", "three"]],
]),
table1.join(table2, :key))
end
+ test("keys: [String]") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2, ["key"]))
+ end
+
test("keys: [String, Symbol]") do
table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
key2: [10, 100, 20, 200],
@@ -1230,7 +1241,7 @@ visible: false
type: :inner))
end
- test("type:") do
+ test("type: :left_outer") do
table1 = Arrow::Table.new(key: [1, 2, 3],
number: [10, 20, 30])
table2 = Arrow::Table.new(key: [3, 1],
@@ -1238,12 +1249,85 @@ visible: false
assert_equal(Arrow::Table.new([
["key", [1, 3, 2]],
["number", [10, 30, 20]],
- ["key", [1, 3, nil]],
["string", ["one", "three", nil]],
]),
table1.join(table2, "key", type: :left_outer))
end
+ test("type: :right_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2, "key", type: :right_outer))
+ end
+
+ test("type: :full_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2, "key", type: :full_outer))
+ end
+
+ test("type: :left_semi") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ]),
+ table1.join(table2, "key", type: :left_semi))
+ end
+
+ test("type: :right_semi") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [3, 1]],
+ ["string", ["three", "one"]],
+ ]),
+ table1.join(table2, "key", type: :right_semi))
+ end
+
+ test("type: :left_anti") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [2]],
+ ["number", [20]],
+ ]),
+ table1.join(table2, "key", type: :left_anti))
+ end
+
+ test("type: :right_anti") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", Arrow::ChunkedArray.new(:uint8)],
+ ["string", Arrow::ChunkedArray.new(:string)],
+ ]),
+ table1.join(table2, "key", type: :right_anti))
+ end
+
test("left_outputs: & right_outputs:") do
table1 = Arrow::Table.new(key: [1, 2, 3],
number: [10, 20, 30])
@@ -1257,5 +1341,94 @@ visible: false
left_outputs: ["key", "number"],
right_outputs: ["string"]))
end
+
+ test("left_outputs: & type: :inner") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]]
+ ]),
+ table1.join(table2,
+ type: :inner,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("left_outputs: & type: :left_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :left_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("left_outputs: & type: :right_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3]],
+ ["number", [10, 30]],
+ ["key", [1, 3]],
+ ["string", ["one", "three"]],
+ ]),
+ table1.join(table2,
+ type: :right_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("left_outputs: & type: :full_outer") do
+ table1 = Arrow::Table.new(key: [1, 2, 3],
+ number: [10, 20, 30])
+ table2 = Arrow::Table.new(key: [3, 1],
+ string: ["three", "one"])
+ assert_equal(Arrow::Table.new([
+ ["key", [1, 3, 2]],
+ ["number", [10, 30, 20]],
+ ["key", [1, 3, nil]],
+ ["string", ["one", "three", nil]],
+ ]),
+ table1.join(table2,
+ type: :full_outer,
+ left_outputs: table1.column_names,
+ right_outputs: table2.column_names))
+ end
+
+ test("left_suffix: & keys: [String]") do
+ table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
+ key2: [10, 100, 20, 200],
+ number: [1010, 1100, 2020, 2200])
+ table2 = Arrow::Table.new(key1: [1, 2, 2],
+ key2: [100, 20, 50],
+ string: ["1-100", "2-20", "2-50"])
+ assert_equal(Arrow::Table.new([
+ ["key1_left", [1, 2]],
+ ["key2_left", [100, 20]],
+ ["number", [1100, 2020]],
+ ["key1_right", [1, 2]],
+ ["key2_right", [100, 20]],
+ ["string", ["1-100", "2-20"]],
+ ]),
+ table1.join(table2,
+ ["key1", "key2"],
+ left_suffix: "_left",
+ right_suffix: "_right"))
+ end
end
end