You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/06/27 06:54:27 UTC

[arrow] branch master updated: ARROW-5535: [GLib] Add garrow_table_slice()

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 576b520  ARROW-5535: [GLib] Add garrow_table_slice()
576b520 is described below

commit 576b52074b60485fb9ba52b727b4582bd3763b97
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Thu Jun 27 15:54:17 2019 +0900

    ARROW-5535: [GLib] Add garrow_table_slice()
    
    `Arrow::Table#slice` has already been implemented in Red Arrow. So this PR replaces current `#slice` with `#slice` implemented by GLib.
    
    Author: Sutou Kouhei <ko...@clear-code.com>
    Author: Yosuke Shiro <yo...@gmail.com>
    
    Closes #4658 from shiro615/glib-table-slice and squashes the following commits:
    
    7f8f2387 <Sutou Kouhei> Accept negative integer as offset
    02b80262 <Sutou Kouhei> Use Hash to compare values
    dd27d9db <Sutou Kouhei> Simplify offset check
    75bb7f05 <Sutou Kouhei> Improve variable name
    9611374e <Sutou Kouhei> Simplify
    a03a39f6 <Sutou Kouhei> Resolve negative index in slice
    e8ef8440 <Sutou Kouhei> Update expected
    bd112f93 <Sutou Kouhei> Don't ignore nil returned by slicer {|slicer| ...}
    13868a28 <Sutou Kouhei> Add a document for slice {|slicer|}
    d7957199 <Sutou Kouhei> Improve error message
    8bd2b133 <Sutou Kouhei> Fix namespace
    2debac91 <Sutou Kouhei> Fix indent
    c2afd2f5 <Sutou Kouhei> Fix documented parameter names
    eaca7da1 <Sutou Kouhei> Fix document
    392f5f73 <Sutou Kouhei> Remove built .bundle
    2034f4f9 <Yosuke Shiro> Fix test case names
    79cf517e <Yosuke Shiro> Fix error messages
    08a54008 <Yosuke Shiro> Update documents of #slice
    ceb97add <Yosuke Shiro> Return nil if out of index
    d62e6147 <Yosuke Shiro> Can't give arguments with block
    57ce5295 <Yosuke Shiro> Validate from and the length
    ef2dccaf <Yosuke Shiro> Put a new line between alias_method : and alias_method :slice_raw
    0da545cf <Yosuke Shiro> Improve the documents for #slice
    7418c0ac <Yosuke Shiro> Return Arrow::Record for #slice(index)
    0418d103 <Yosuke Shiro> Fix parameter names in documents
    0461313a <Yosuke Shiro> Add documents for slice(range_*)
    6f1c9e8a <Yosuke Shiro> Fix array range
    8f99db1f <Yosuke Shiro> Add documents of Arrow::Table#slice
    a8841661 <Yosuke Shiro> Use #slice implemented by GLib
    4c03fd33 <Yosuke Shiro>  Add garrow_table_slice()
---
 .gitignore                        |   1 +
 c_glib/arrow-glib/table.cpp       |  27 +++++++
 c_glib/arrow-glib/table.h         |   5 ++
 c_glib/test/test-table.rb         |  16 ++++
 ruby/red-arrow/lib/arrow/table.rb | 158 ++++++++++++++++++++++----------------
 ruby/red-arrow/test/test-table.rb |  49 +++++++++---
 6 files changed, 177 insertions(+), 79 deletions(-)

diff --git a/.gitignore b/.gitignore
index b1a414e..21e8f34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ arrow-src.tar.gz
 *.py[ocd]
 *.so
 *.so.*
+*.bundle
 *.dylib
 .build_cache_dir
 dependency-reduced-pom.xml
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp
index 51a5adb..a29d18b 100644
--- a/c_glib/arrow-glib/table.cpp
+++ b/c_glib/arrow-glib/table.cpp
@@ -562,6 +562,33 @@ garrow_table_concatenate(GArrowTable *table,
   }
 }
 
+/**
+ * garrow_table_slice:
+ * @table: A #GArrowTable.
+ * @offset: The offset of sub #GArrowTable. If the offset is negative,
+ *   the offset is counted from the last.
+ * @length: The length of sub #GArrowTable.
+ *
+ * Returns: (transfer full): The sub #GArrowTable. It covers
+ *   only from `offset` to `offset + length` range. The sub
+ *   #GArrowTable shares values with the base
+ *   #GArrowTable.
+ *
+ * Since: 0.14.0
+ */
+GArrowTable *
+garrow_table_slice(GArrowTable *table,
+                   gint64 offset,
+                   gint64 length)
+{
+  const auto arrow_table = garrow_table_get_raw(table);
+  if (offset < 0) {
+    offset += arrow_table->num_rows();
+  }
+  auto arrow_sub_table = arrow_table->Slice(offset, length);
+  return garrow_table_new_raw(&arrow_sub_table);
+}
+
 G_END_DECLS
 
 GArrowTable *
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h
index 763f6d8..f802637 100644
--- a/c_glib/arrow-glib/table.h
+++ b/c_glib/arrow-glib/table.h
@@ -94,5 +94,10 @@ GArrowTable *
 garrow_table_concatenate(GArrowTable *table,
                          GList *other_tables,
                          GError **error);
+GARROW_AVAILABLE_IN_0_14
+GArrowTable*
+garrow_table_slice(GArrowTable *table,
+                   gint64 offset,
+                   gint64 length);
 
 G_END_DECLS
diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb
index 491fb58..54ba739 100644
--- a/c_glib/test/test-table.rb
+++ b/c_glib/test/test-table.rb
@@ -183,5 +183,21 @@ valid:
       table3 = build_table("visible" => build_boolean_array([false]))
       assert_equal(table, table1.concatenate([table2, table3]))
     end
+
+    sub_test_case("#slice") do
+      test("offset: positive") do
+        visibles = [true, false, true]
+        table = build_table("visible" => build_boolean_array(visibles))
+        assert_equal(build_table("visible" => build_boolean_array([false, true])),
+                     table.slice(1, 2))
+      end
+
+      test("offset: negative") do
+        visibles = [true, false, true]
+        table = build_table("visible" => build_boolean_array(visibles))
+        assert_equal(build_table("visible" => build_boolean_array([false, true])),
+                     table.slice(-2, 2))
+      end
+    end
   end
 end
diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb
index 89f21aa..64f4b49 100644
--- a/ruby/red-arrow/lib/arrow/table.rb
+++ b/ruby/red-arrow/lib/arrow/table.rb
@@ -198,43 +198,92 @@ module Arrow
 
     alias_method :[], :find_column
 
-    # TODO
+    alias_method :slice_raw, :slice
+
+    # @overload slice(offset, length)
     #
-    # @return [Arrow::Table]
+    #   @param offset [Integer] The offset of sub Arrow::Table.
+    #   @param length [Integer] The length of sub Arrow::Table.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only from
+    #     `offset` to `offset + length` range.
+    #
+    # @overload slice(index)
+    #
+    #   @param index [Integer] The index in this table.
+    #   @return [Arrow::Record]
+    #     The `Arrow::Record` corresponding to index of
+    #     the table.
+    #
+    # @overload slice(booleans)
+    #
+    #   @param booleans [::Array<Boolean>]
+    #     The values indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of indices
+    #     the values of `booleans` is true.
+    #
+    # @overload slice(boolean_array)
+    #
+    #   @param boolean_array [::Array<Arrow::BooleanArray>]
+    #     The values indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of indices
+    #     the values of `boolean_array` is true.
+    #
+    # @overload slice(range)
+    #
+    #   @param range_included_end [Range] The range indicating the target rows.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows of the range of indices.
+    #
+    # @overload slice
+    #
+    #   @yield [slicer] Gives slicer that constructs condition to select records.
+    #   @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
+    #     build condition.
+    #   @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
+    #     The condition to select records.
+    #   @return [Arrow::Table]
+    #     The sub `Arrow::Table` that covers only rows matched by condition
+    #     specified by slicer.
     def slice(*args)
       slicers = []
-      expected_n_args = nil
-      case args.size
-      when 0
-        expected_n_args = "1..2" unless block_given?
-      when 1
-        slicers << args[0]
-      when 2
-        from, to = args
-        slicers << (from...(from + to))
-      else
-        if block_given?
-          expected_n_args = "0..2"
-        else
-          expected_n_args = "1..2"
-        end
-      end
-      if expected_n_args
-        message = "wrong number of arguments " +
-          "(given #{args.size}, expected #{expected_n_args})"
-        raise ArgumentError, message
-      end
-
       if block_given?
+        unless args.empty?
+          raise ArgumentError, "must not specify both arguments and block"
+        end
         block_slicer = yield(Slicer.new(self))
         case block_slicer
-        when nil
-          # Ignore
         when ::Array
           slicers.concat(block_slicer)
         else
           slicers << block_slicer
         end
+      else
+        expected_n_args = nil
+        case args.size
+        when 1
+          if args[0].is_a?(Integer)
+            index = args[0]
+            index += n_rows if index < 0
+            return nil if index < 0
+            return nil if index >= n_rows
+            return Record.new(self, index)
+          else
+            slicers << args[0]
+          end
+        when 2
+          offset, length = args
+          slicers << (offset...(offset + length))
+        else
+          expected_n_args = "1..2"
+        end
+        if expected_n_args
+          message = "wrong number of arguments " +
+            "(given #{args.size}, expected #{expected_n_args})"
+          raise ArgumentError, message
+        end
       end
 
       ranges = []
@@ -243,12 +292,18 @@ module Arrow
         case slicer
         when Integer
           slicer += n_rows if slicer < 0
-          ranges << [slicer, slicer]
+          ranges << [slicer, n_rows - 1]
         when Range
-          from = slicer.first
+          original_from = from = slicer.first
           to = slicer.last
           to -= 1 if slicer.exclude_end?
           from += n_rows if from < 0
+          if from < 0 or from >= n_rows
+            message =
+              "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
+              "#{original_from}"
+            raise ArgumentError, message
+          end
           to += n_rows if to < 0
           ranges << [from, to]
         when ::Array
@@ -457,47 +512,16 @@ module Arrow
       end
     end
 
-    # TODO: Almost codes should be implemented in Apache Arrow C++.
     def slice_by_ranges(ranges)
-      sliced_columns = columns.collect do |column|
-        chunks = []
-        arrays = column.data.each_chunk.to_a
-        offset = 0
-        offset_in_array = 0
-        ranges.each do |from, to|
-          range_size = to - from + 1
-          while range_size > 0
-            while offset + arrays.first.length - offset_in_array < from
-              offset += arrays.first.length - offset_in_array
-              arrays.shift
-              offset_in_array = 0
-            end
-            if offset < from
-              skipped_size = from - offset
-              offset += skipped_size
-              offset_in_array += skipped_size
-            end
-            array = arrays.first
-            array_length = array.length
-            rest_length = array_length - offset_in_array
-            if rest_length <= range_size
-              chunks << array.slice(offset_in_array, array_length)
-              offset += rest_length
-              range_size -= rest_length
-              offset_in_array = 0
-              arrays.shift
-            else
-              chunks << array.slice(offset_in_array, range_size)
-              offset += range_size
-              offset_in_array += range_size
-              range_size = 0
-            end
-          end
-        end
-        Column.new(column.field, ChunkedArray.new(chunks))
+      sliced_table = []
+      ranges.each do |from, to|
+        sliced_table << slice_raw(from, to - from + 1)
+      end
+      if sliced_table.size > 1
+        sliced_table[0].concatenate(sliced_table[1..-1])
+      else
+        sliced_table[0]
       end
-
-      self.class.new(schema, sliced_columns)
     end
 
     def ensure_column(name, data)
diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb
index 2b7a46c..dce5d25 100644
--- a/ruby/red-arrow/test/test-table.rb
+++ b/ruby/red-arrow/test/test-table.rb
@@ -74,17 +74,24 @@ class TableTest < Test::Unit::TestCase
     end
 
     test("Integer: positive") do
-      assert_equal(<<-TABLE, @table.slice(2).to_s)
-	count	visible
-0	    4	       
-      TABLE
+      assert_equal({"count" => 128, "visible" => nil},
+                   @table.slice(@table.n_rows - 1).to_h)
     end
 
     test("Integer: negative") do
-      assert_equal(<<-TABLE, @table.slice(-1).to_s)
-	count	visible
-0	  128	       
-      TABLE
+      assert_equal({"count" => 1, "visible" => true},
+                   @table.slice(-@table.n_rows).to_h)
+    end
+
+    test("Integer: out of index") do
+      assert_equal([
+                     nil,
+                     nil,
+                   ],
+                   [
+                     @table.slice(@table.n_rows),
+                     @table.slice(-(@table.n_rows + 1)),
+                   ])
     end
 
     test("Range: positive: include end") do
@@ -145,17 +152,35 @@ class TableTest < Test::Unit::TestCase
         end
       end
 
-      test("too many arguments: with block") do
+      test("too many arguments") do
         message = "wrong number of arguments (given 3, expected 1..2)"
         assert_raise(ArgumentError.new(message)) do
           @table.slice(1, 2, 3)
         end
       end
 
-      test("too many arguments: without block") do
-        message = "wrong number of arguments (given 3, expected 0..2)"
+      test("arguments: with block") do
+        message = "must not specify both arguments and block"
+        assert_raise(ArgumentError.new(message)) do
+          @table.slice(1, 2) {}
+        end
+      end
+
+      test("offset: too small") do
+        n_rows = @table.n_rows
+        offset = -(n_rows + 1)
+        message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
+        assert_raise(ArgumentError.new(message)) do
+          @table.slice(offset, 1)
+        end
+      end
+
+      test("offset: too large") do
+        n_rows = @table.n_rows
+        offset = n_rows
+        message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
         assert_raise(ArgumentError.new(message)) do
-          @table.slice(1, 2, 3) {}
+          @table.slice(offset, 1)
         end
       end
     end