You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/01/17 07:50:06 UTC

[arrow] branch master updated: GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a8272001d GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)
1a8272001d is described below

commit 1a8272001deb5be1053bb737493c368f659bce09
Author: Hirokazu SUZUKI <he...@gmail.com>
AuthorDate: Tue Jan 17 16:49:59 2023 +0900

    GH-15287: [Ruby] Merge column and add suffix in Table#join (#33654)
    
    # Rationale for this change
    
    The current implementation always preserves the join key columns from both sides. It is more convenient if these columns are merged.
    
    # What changes are included in this PR?
    
    - Columns from left and right are merged if:
      - Join key is a String or a Symbol (<= incompatible change)
      - Join key is nil (natural join) (<= change in unreleased feature)
    - New options `left_suffix=""` and `right_suffix=""` are introduced.
      - If it is empty (by default), join key(s) do not change.
      - If it is not empty, the suffix is appended to join key(s).
    
    # Are these changes tested?
    
    Yes.
    
    # Are there any user-facing changes?
    
    There is an incompatible change when the join key is a String or a Symbol.
    
    * Closes: #15287
    
    Lead-authored-by: Hirokazu SUZUKI <he...@gmail.com>
    Co-authored-by: Sutou Kouhei <ko...@clear-code.com>
    Co-authored-by: Sutou Kouhei <ko...@cozmixng.org>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 c_glib/arrow-glib/compute.cpp     |   2 +-
 ruby/red-arrow/lib/arrow/table.rb | 136 ++++++++++++++++++++++++++--
 ruby/red-arrow/test/test-table.rb | 185 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 309 insertions(+), 14 deletions(-)

diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp
index 27e49b0027..fd5a6bae04 100644
--- a/c_glib/arrow-glib/compute.cpp
+++ b/c_glib/arrow-glib/compute.cpp
@@ -5195,7 +5195,7 @@ G_END_DECLS
 arrow::Result<arrow::FieldRef>
 garrow_field_reference_resolve_raw(const gchar *reference)
 {
-  if (reference && reference[0] == '.') {
+  if (reference && (reference[0] == '.' || reference[0] == '[')) {
     return arrow::FieldRef::FromDotPath(reference);
   } else {
     arrow::FieldRef arrow_reference(reference);
diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb
index 4f01620701..a14bf90d1f 100644
--- a/ruby/red-arrow/lib/arrow/table.rb
+++ b/ruby/red-arrow/lib/arrow/table.rb
@@ -472,18 +472,22 @@ module Arrow
     #
     #     If both of `left_outputs` and `right_outputs` aren't
     #     specified, all columns in `self` and `right` are
-    #     outputted.
+    #     output.
     #   @param right_outputs [::Array<String, Symbol>] Output columns in
     #     `right`.
     #
     #     If both of `left_outputs` and `right_outputs` aren't
     #     specified, all columns in `self` and `right` are
-    #     outputted.
+    #     output.
     #   @return [Arrow::Table]
     #     The joined `Arrow::Table`.
     #
     # @overload join(right, type: :inner, left_outputs: nil, right_outputs: nil)
-    #   If key(s) are not supplied, common keys in self and right are used.
+    #   If key(s) are not supplied, common keys in self and right are used
+    #   (natural join).
+    #
+    #   Columns used as keys are merged and remain on the left side
+    #   when both of `left_outputs` and `right_outputs` are `nil`.
     #
     #   @macro join_common_before
     #   @macro join_common_after
@@ -493,13 +497,19 @@ module Arrow
     # @overload join(right, key, type: :inner, left_outputs: nil, right_outputs: nil)
     #   Join right by a key.
     #
+    #   Columns used as keys are merged and remain on the left side
+    #   when both of `left_outputs` and `right_outputs` are `nil`.
+    #
     #   @macro join_common_before
     #   @param key [String, Symbol] A join key.
     #   @macro join_common_after
     #
-    # @overload join(right, keys, type: :inner, left_outputs: nil, right_outputs: nil)
+    # @overload join(right, keys, type: :inner, left_suffix: "", right_suffix: "",
+    #                left_outputs: nil, right_outputs: nil)
     #   Join right by keys.
     #
+    #   Column names can be renamed by appending `left_suffix` or `right_suffix`.
+    #
     #   @macro join_common_before
     #   @param keys [::Array<String, Symbol>] Join keys.
     #   @macro join_common_after
@@ -516,8 +526,16 @@ module Arrow
     #   @macro join_common_after
     #
     # @since 7.0.0
-    def join(right, keys=nil, type: :inner, left_outputs: nil, right_outputs: nil)
+    def join(right,
+             keys=nil,
+             type: :inner,
+             left_suffix: "",
+             right_suffix: "",
+             left_outputs: nil,
+             right_outputs: nil)
+      is_natural_join = keys.nil?
       keys ||= (column_names & right.column_names)
+      type = JoinType.try_convert(type) || type
       plan = ExecutePlan.new
       left_node = plan.build_source_node(self)
       right_node = plan.build_source_node(right)
@@ -533,21 +551,43 @@ module Arrow
       hash_join_node_options = HashJoinNodeOptions.new(type,
                                                        left_keys,
                                                        right_keys)
+      use_manual_outputs = false
       unless left_outputs.nil?
         hash_join_node_options.left_outputs = left_outputs
+        use_manual_outputs = true
       end
       unless right_outputs.nil?
         hash_join_node_options.right_outputs = right_outputs
+        use_manual_outputs = true
       end
       hash_join_node = plan.build_hash_join_node(left_node,
                                                  right_node,
                                                  hash_join_node_options)
+      type_nick = type.nick
+      is_filter_join = (type_nick.end_with?("-semi") or
+                        type_nick.end_with?("-anti"))
+      if use_manual_outputs or is_filter_join
+        process_node = hash_join_node
+      elsif is_natural_join
+        process_node = join_merge_keys(plan, hash_join_node, right, keys)
+      elsif keys.is_a?(String) or keys.is_a?(Symbol)
+        process_node = join_merge_keys(plan, hash_join_node, right, [keys.to_s])
+      elsif !keys.is_a?(Hash) and (left_suffix != "" or right_suffix != "")
+        process_node = join_rename_keys(plan,
+                                        hash_join_node,
+                                        right,
+                                        keys,
+                                        left_suffix,
+                                        right_suffix)
+      else
+        process_node = hash_join_node
+      end
       sink_node_options = SinkNodeOptions.new
-      plan.build_sink_node(hash_join_node, sink_node_options)
+      plan.build_sink_node(process_node, sink_node_options)
       plan.validate
       plan.start
       plan.wait
-      reader = sink_node_options.get_reader(hash_join_node.output_schema)
+      reader = sink_node_options.get_reader(process_node.output_schema)
       table = reader.read_all
       share_input(table)
       table
@@ -620,5 +660,87 @@ module Arrow
         raise ArgumentError, message
       end
     end
+
+    def join_merge_keys(plan, input_node, right, keys)
+      expressions = []
+      names = []
+      normalized_keys = {}
+      keys.each do |key|
+        normalized_keys[key.to_s] = true
+      end
+      key_to_outputs = {}
+      outputs = []
+      left_n_column_names = column_names.size
+      column_names.each_with_index do |name, i|
+        is_key = normalized_keys.include?(name)
+        output = {is_key: is_key, name: name, index: i, direction: :left}
+        outputs << output
+        key_to_outputs[name] = {left: output} if is_key
+      end
+      right.column_names.each_with_index do |name, i|
+        index = left_n_column_names + i
+        is_key = normalized_keys.include?(name)
+        output = {is_key: is_key, name: name, index: index, direction: :right}
+        outputs << output
+        key_to_outputs[name][:right] = output if is_key
+      end
+
+      outputs.each do |output|
+        if output[:is_key]
+          next if output[:direction] == :right
+          left_output = key_to_outputs[output[:name]][:left]
+          right_output = key_to_outputs[output[:name]][:right]
+          left_field = FieldExpression.new("[#{left_output[:index]}]")
+          right_field = FieldExpression.new("[#{right_output[:index]}]")
+          is_left_null = CallExpression.new("is_null", [left_field])
+          merge_column = CallExpression.new("if_else",
+                                            [
+                                              is_left_null,
+                                              right_field,
+                                              left_field,
+                                            ])
+          expressions << merge_column
+        else
+          expressions << FieldExpression.new("[#{output[:index]}]")
+        end
+        names << output[:name]
+      end
+      project_node_options = ProjectNodeOptions.new(expressions, names)
+      plan.build_project_node(input_node, project_node_options)
+    end
+
+    def join_rename_keys(plan,
+                         input_node,
+                         right,
+                         keys,
+                         left_suffix,
+                         right_suffix)
+      expressions = []
+      names = []
+      normalized_keys = {}
+      keys.each do |key|
+        normalized_keys[key.to_s] = true
+      end
+      left_n_column_names = column_names.size
+      column_names.each_with_index do |name, i|
+        expressions << FieldExpression.new("[#{i}]")
+        if normalized_keys.include?(name)
+          names << "#{name}#{left_suffix}"
+        else
+          names << name
+        end
+      end
+      right.column_names.each_with_index do |name, i|
+        index = left_n_column_names + i
+        expressions << FieldExpression.new("[#{index}]")
+        if normalized_keys.include?(name)
+          names << "#{name}#{right_suffix}"
+        else
+          names << name
+        end
+      end
+      project_node_options = ProjectNodeOptions.new(expressions, names)
+      plan.build_project_node(input_node, project_node_options)
+    end
   end
 end
diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb
index bd09c3536a..608091c675 100644
--- a/ruby/red-arrow/test/test-table.rb
+++ b/ruby/red-arrow/test/test-table.rb
@@ -1131,7 +1131,7 @@ visible: false
   end
 
   sub_test_case("#join") do
-    test("no keys") do
+    test("keys: nil (natural join)") do
       table1 = Arrow::Table.new(key: [1, 2, 3],
                                 number: [10, 20, 30])
       table2 = Arrow::Table.new(key: [3, 1],
@@ -1139,7 +1139,6 @@ visible: false
       assert_equal(Arrow::Table.new([
                                       ["key", [1, 3]],
                                       ["number", [10, 30]],
-                                      ["key", [1, 3]],
                                       ["string", ["one", "three"]],
                                     ]),
                    table1.join(table2))
@@ -1153,7 +1152,6 @@ visible: false
       assert_equal(Arrow::Table.new([
                                       ["key", [1, 3]],
                                       ["number", [10, 30]],
-                                      ["key", [1, 3]],
                                       ["string", ["one", "three"]],
                                     ]),
                    table1.join(table2, "key"))
@@ -1167,12 +1165,25 @@ visible: false
       assert_equal(Arrow::Table.new([
                                       ["key", [1, 3]],
                                       ["number", [10, 30]],
-                                      ["key", [1, 3]],
                                       ["string", ["one", "three"]],
                                     ]),
                    table1.join(table2, :key))
     end
 
+    test("keys: [String]") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2, ["key"]))
+    end
+
     test("keys: [String, Symbol]") do
       table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
                                 key2: [10, 100, 20, 200],
@@ -1230,7 +1241,7 @@ visible: false
                                type: :inner))
     end
 
-    test("type:") do
+    test("type: :left_outer") do
       table1 = Arrow::Table.new(key: [1, 2, 3],
                                 number: [10, 20, 30])
       table2 = Arrow::Table.new(key: [3, 1],
@@ -1238,12 +1249,85 @@ visible: false
       assert_equal(Arrow::Table.new([
                                       ["key", [1, 3, 2]],
                                       ["number", [10, 30, 20]],
-                                      ["key", [1, 3, nil]],
                                       ["string", ["one", "three", nil]],
                                     ]),
                    table1.join(table2, "key", type: :left_outer))
     end
 
+    test("type: :right_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2, "key", type: :right_outer))
+    end
+
+    test("type: :full_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2, "key", type: :full_outer))
+    end
+
+    test("type: :left_semi") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                    ]),
+                   table1.join(table2, "key", type: :left_semi))
+    end
+
+    test("type: :right_semi") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [3, 1]],
+                                      ["string", ["three", "one"]],
+                                    ]),
+                   table1.join(table2, "key", type: :right_semi))
+    end
+
+    test("type: :left_anti") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [2]],
+                                      ["number", [20]],
+                                    ]),
+                   table1.join(table2, "key", type: :left_anti))
+    end
+
+    test("type: :right_anti") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", Arrow::ChunkedArray.new(:uint8)],
+                                      ["string", Arrow::ChunkedArray.new(:string)],
+                                    ]),
+                   table1.join(table2, "key", type: :right_anti))
+    end
+
     test("left_outputs: & right_outputs:") do
       table1 = Arrow::Table.new(key: [1, 2, 3],
                                 number: [10, 20, 30])
@@ -1257,5 +1341,94 @@ visible: false
                                left_outputs: ["key", "number"],
                                right_outputs: ["string"]))
     end
+
+    test("left_outputs: & type: :inner") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]]
+                                    ]),
+                   table1.join(table2,
+                               type: :inner,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("left_outputs: & type: :left_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :left_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("left_outputs: & type: :right_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3]],
+                                      ["number", [10, 30]],
+                                      ["key", [1, 3]],
+                                      ["string", ["one", "three"]],
+                                    ]),
+                   table1.join(table2,
+                               type: :right_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("left_outputs: & type: :full_outer") do
+      table1 = Arrow::Table.new(key: [1, 2, 3],
+                                number: [10, 20, 30])
+      table2 = Arrow::Table.new(key: [3, 1],
+                                string: ["three", "one"])
+      assert_equal(Arrow::Table.new([
+                                      ["key", [1, 3, 2]],
+                                      ["number", [10, 30, 20]],
+                                      ["key", [1, 3, nil]],
+                                      ["string", ["one", "three", nil]],
+                                    ]),
+                   table1.join(table2,
+                               type: :full_outer,
+                               left_outputs: table1.column_names,
+                               right_outputs: table2.column_names))
+    end
+
+    test("left_suffix: & keys: [String]") do
+      table1 = Arrow::Table.new(key1: [1, 1, 2, 2],
+                                key2: [10, 100, 20, 200],
+                                number: [1010, 1100, 2020, 2200])
+      table2 = Arrow::Table.new(key1: [1, 2, 2],
+                                key2: [100, 20, 50],
+                                string: ["1-100", "2-20", "2-50"])
+      assert_equal(Arrow::Table.new([
+                                      ["key1_left", [1, 2]],
+                                      ["key2_left", [100, 20]],
+                                      ["number", [1100, 2020]],
+                                      ["key1_right", [1, 2]],
+                                      ["key2_right", [100, 20]],
+                                      ["string", ["1-100", "2-20"]],
+                                    ]),
+                    table1.join(table2,
+                                ["key1", "key2"],
+                                left_suffix: "_left",
+                                right_suffix: "_right"))
+    end
   end
 end