You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2019/06/27 06:54:27 UTC
[arrow] branch master updated: ARROW-5535: [GLib] Add
garrow_table_slice()
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 576b520 ARROW-5535: [GLib] Add garrow_table_slice()
576b520 is described below
commit 576b52074b60485fb9ba52b727b4582bd3763b97
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Thu Jun 27 15:54:17 2019 +0900
ARROW-5535: [GLib] Add garrow_table_slice()
`Arrow::Table#slice` has already been implemented in Red Arrow. So this PR replaces current `#slice` with `#slice` implemented by GLib.
Author: Sutou Kouhei <ko...@clear-code.com>
Author: Yosuke Shiro <yo...@gmail.com>
Closes #4658 from shiro615/glib-table-slice and squashes the following commits:
7f8f2387 <Sutou Kouhei> Accept negative integer as offset
02b80262 <Sutou Kouhei> Use Hash to compare values
dd27d9db <Sutou Kouhei> Simplify offset check
75bb7f05 <Sutou Kouhei> Improve variable name
9611374e <Sutou Kouhei> Simplify
a03a39f6 <Sutou Kouhei> Resolve negative index in slice
e8ef8440 <Sutou Kouhei> Update expected
bd112f93 <Sutou Kouhei> Don't ignore nil returned by slicer {|slicer| ...}
13868a28 <Sutou Kouhei> Add a document for slice {|slicer|}
d7957199 <Sutou Kouhei> Improve error message
8bd2b133 <Sutou Kouhei> Fix namespace
2debac91 <Sutou Kouhei> Fix indent
c2afd2f5 <Sutou Kouhei> Fix documented parameter names
eaca7da1 <Sutou Kouhei> Fix document
392f5f73 <Sutou Kouhei> Remove built .bundle
2034f4f9 <Yosuke Shiro> Fix test case names
79cf517e <Yosuke Shiro> Fix error messages
08a54008 <Yosuke Shiro> Update documents of #slice
ceb97add <Yosuke Shiro> Return nil if out of index
d62e6147 <Yosuke Shiro> Can't give arguments with block
57ce5295 <Yosuke Shiro> Validate from and the length
ef2dccaf <Yosuke Shiro> Put a new line between alias_method : and alias_method :slice_raw
0da545cf <Yosuke Shiro> Improve the documents for #slice
7418c0ac <Yosuke Shiro> Return Arrow::Record for #slice(index)
0418d103 <Yosuke Shiro> Fix parameter names in documents
0461313a <Yosuke Shiro> Add documents for slice(range_*)
6f1c9e8a <Yosuke Shiro> Fix array range
8f99db1f <Yosuke Shiro> Add documents of Arrow::Table#slice
a8841661 <Yosuke Shiro> Use #slice implemented by GLib
4c03fd33 <Yosuke Shiro> Add garrow_table_slice()
---
.gitignore | 1 +
c_glib/arrow-glib/table.cpp | 27 +++++++
c_glib/arrow-glib/table.h | 5 ++
c_glib/test/test-table.rb | 16 ++++
ruby/red-arrow/lib/arrow/table.rb | 158 ++++++++++++++++++++++----------------
ruby/red-arrow/test/test-table.rb | 49 +++++++++---
6 files changed, 177 insertions(+), 79 deletions(-)
diff --git a/.gitignore b/.gitignore
index b1a414e..21e8f34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ arrow-src.tar.gz
*.py[ocd]
*.so
*.so.*
+*.bundle
*.dylib
.build_cache_dir
dependency-reduced-pom.xml
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp
index 51a5adb..a29d18b 100644
--- a/c_glib/arrow-glib/table.cpp
+++ b/c_glib/arrow-glib/table.cpp
@@ -562,6 +562,33 @@ garrow_table_concatenate(GArrowTable *table,
}
}
+/**
+ * garrow_table_slice:
+ * @table: A #GArrowTable.
+ * @offset: The offset of sub #GArrowTable. If the offset is negative,
+ * the offset is counted from the last.
+ * @length: The length of sub #GArrowTable.
+ *
+ * Returns: (transfer full): The sub #GArrowTable. It covers
+ * only from `offset` to `offset + length` range. The sub
+ * #GArrowTable shares values with the base
+ * #GArrowTable.
+ *
+ * Since: 0.14.0
+ */
+GArrowTable *
+garrow_table_slice(GArrowTable *table,
+ gint64 offset,
+ gint64 length)
+{
+ const auto arrow_table = garrow_table_get_raw(table);
+ if (offset < 0) {
+ offset += arrow_table->num_rows();
+ }
+ auto arrow_sub_table = arrow_table->Slice(offset, length);
+ return garrow_table_new_raw(&arrow_sub_table);
+}
+
G_END_DECLS
GArrowTable *
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h
index 763f6d8..f802637 100644
--- a/c_glib/arrow-glib/table.h
+++ b/c_glib/arrow-glib/table.h
@@ -94,5 +94,10 @@ GArrowTable *
garrow_table_concatenate(GArrowTable *table,
GList *other_tables,
GError **error);
+GARROW_AVAILABLE_IN_0_14
+GArrowTable*
+garrow_table_slice(GArrowTable *table,
+ gint64 offset,
+ gint64 length);
G_END_DECLS
diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb
index 491fb58..54ba739 100644
--- a/c_glib/test/test-table.rb
+++ b/c_glib/test/test-table.rb
@@ -183,5 +183,21 @@ valid:
table3 = build_table("visible" => build_boolean_array([false]))
assert_equal(table, table1.concatenate([table2, table3]))
end
+
+ sub_test_case("#slice") do
+ test("offset: positive") do
+ visibles = [true, false, true]
+ table = build_table("visible" => build_boolean_array(visibles))
+ assert_equal(build_table("visible" => build_boolean_array([false, true])),
+ table.slice(1, 2))
+ end
+
+ test("offset: negative") do
+ visibles = [true, false, true]
+ table = build_table("visible" => build_boolean_array(visibles))
+ assert_equal(build_table("visible" => build_boolean_array([false, true])),
+ table.slice(-2, 2))
+ end
+ end
end
end
diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb
index 89f21aa..64f4b49 100644
--- a/ruby/red-arrow/lib/arrow/table.rb
+++ b/ruby/red-arrow/lib/arrow/table.rb
@@ -198,43 +198,92 @@ module Arrow
alias_method :[], :find_column
- # TODO
+ alias_method :slice_raw, :slice
+
+ # @overload slice(offset, length)
#
- # @return [Arrow::Table]
+ # @param offset [Integer] The offset of sub Arrow::Table.
+ # @param length [Integer] The length of sub Arrow::Table.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only from
+ # `offset` to `offset + length` range.
+ #
+ # @overload slice(index)
+ #
+ # @param index [Integer] The index in this table.
+ # @return [Arrow::Record]
+ # The `Arrow::Record` corresponding to index of
+ # the table.
+ #
+ # @overload slice(booleans)
+ #
+ # @param booleans [::Array<Boolean>]
+ # The values indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of indices
+ # the values of `booleans` is true.
+ #
+ # @overload slice(boolean_array)
+ #
+ # @param boolean_array [::Array<Arrow::BooleanArray>]
+ # The values indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of indices
+ # the values of `boolean_array` is true.
+ #
+ # @overload slice(range)
+ #
+ # @param range_included_end [Range] The range indicating the target rows.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows of the range of indices.
+ #
+ # @overload slice
+ #
+ # @yield [slicer] Gives slicer that constructs condition to select records.
+ # @yieldparam slicer [Arrow::Slicer] The slicer that helps us to
+ # build condition.
+ # @yieldreturn [Arrow::Slicer::Condition, ::Array<Arrow::Slicer::Condition>]
+ # The condition to select records.
+ # @return [Arrow::Table]
+ # The sub `Arrow::Table` that covers only rows matched by condition
+ # specified by slicer.
def slice(*args)
slicers = []
- expected_n_args = nil
- case args.size
- when 0
- expected_n_args = "1..2" unless block_given?
- when 1
- slicers << args[0]
- when 2
- from, to = args
- slicers << (from...(from + to))
- else
- if block_given?
- expected_n_args = "0..2"
- else
- expected_n_args = "1..2"
- end
- end
- if expected_n_args
- message = "wrong number of arguments " +
- "(given #{args.size}, expected #{expected_n_args})"
- raise ArgumentError, message
- end
-
if block_given?
+ unless args.empty?
+ raise ArgumentError, "must not specify both arguments and block"
+ end
block_slicer = yield(Slicer.new(self))
case block_slicer
- when nil
- # Ignore
when ::Array
slicers.concat(block_slicer)
else
slicers << block_slicer
end
+ else
+ expected_n_args = nil
+ case args.size
+ when 1
+ if args[0].is_a?(Integer)
+ index = args[0]
+ index += n_rows if index < 0
+ return nil if index < 0
+ return nil if index >= n_rows
+ return Record.new(self, index)
+ else
+ slicers << args[0]
+ end
+ when 2
+ offset, length = args
+ slicers << (offset...(offset + length))
+ else
+ expected_n_args = "1..2"
+ end
+ if expected_n_args
+ message = "wrong number of arguments " +
+ "(given #{args.size}, expected #{expected_n_args})"
+ raise ArgumentError, message
+ end
end
ranges = []
@@ -243,12 +292,18 @@ module Arrow
case slicer
when Integer
slicer += n_rows if slicer < 0
- ranges << [slicer, slicer]
+ ranges << [slicer, n_rows - 1]
when Range
- from = slicer.first
+ original_from = from = slicer.first
to = slicer.last
to -= 1 if slicer.exclude_end?
from += n_rows if from < 0
+ if from < 0 or from >= n_rows
+ message =
+ "offset is out of range (-#{n_rows + 1},#{n_rows}): " +
+ "#{original_from}"
+ raise ArgumentError, message
+ end
to += n_rows if to < 0
ranges << [from, to]
when ::Array
@@ -457,47 +512,16 @@ module Arrow
end
end
- # TODO: Almost codes should be implemented in Apache Arrow C++.
def slice_by_ranges(ranges)
- sliced_columns = columns.collect do |column|
- chunks = []
- arrays = column.data.each_chunk.to_a
- offset = 0
- offset_in_array = 0
- ranges.each do |from, to|
- range_size = to - from + 1
- while range_size > 0
- while offset + arrays.first.length - offset_in_array < from
- offset += arrays.first.length - offset_in_array
- arrays.shift
- offset_in_array = 0
- end
- if offset < from
- skipped_size = from - offset
- offset += skipped_size
- offset_in_array += skipped_size
- end
- array = arrays.first
- array_length = array.length
- rest_length = array_length - offset_in_array
- if rest_length <= range_size
- chunks << array.slice(offset_in_array, array_length)
- offset += rest_length
- range_size -= rest_length
- offset_in_array = 0
- arrays.shift
- else
- chunks << array.slice(offset_in_array, range_size)
- offset += range_size
- offset_in_array += range_size
- range_size = 0
- end
- end
- end
- Column.new(column.field, ChunkedArray.new(chunks))
+ sliced_table = []
+ ranges.each do |from, to|
+ sliced_table << slice_raw(from, to - from + 1)
+ end
+ if sliced_table.size > 1
+ sliced_table[0].concatenate(sliced_table[1..-1])
+ else
+ sliced_table[0]
end
-
- self.class.new(schema, sliced_columns)
end
def ensure_column(name, data)
diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb
index 2b7a46c..dce5d25 100644
--- a/ruby/red-arrow/test/test-table.rb
+++ b/ruby/red-arrow/test/test-table.rb
@@ -74,17 +74,24 @@ class TableTest < Test::Unit::TestCase
end
test("Integer: positive") do
- assert_equal(<<-TABLE, @table.slice(2).to_s)
- count visible
-0 4
- TABLE
+ assert_equal({"count" => 128, "visible" => nil},
+ @table.slice(@table.n_rows - 1).to_h)
end
test("Integer: negative") do
- assert_equal(<<-TABLE, @table.slice(-1).to_s)
- count visible
-0 128
- TABLE
+ assert_equal({"count" => 1, "visible" => true},
+ @table.slice(-@table.n_rows).to_h)
+ end
+
+ test("Integer: out of index") do
+ assert_equal([
+ nil,
+ nil,
+ ],
+ [
+ @table.slice(@table.n_rows),
+ @table.slice(-(@table.n_rows + 1)),
+ ])
end
test("Range: positive: include end") do
@@ -145,17 +152,35 @@ class TableTest < Test::Unit::TestCase
end
end
- test("too many arguments: with block") do
+ test("too many arguments") do
message = "wrong number of arguments (given 3, expected 1..2)"
assert_raise(ArgumentError.new(message)) do
@table.slice(1, 2, 3)
end
end
- test("too many arguments: without block") do
- message = "wrong number of arguments (given 3, expected 0..2)"
+ test("arguments: with block") do
+ message = "must not specify both arguments and block"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(1, 2) {}
+ end
+ end
+
+ test("offset: too small") do
+ n_rows = @table.n_rows
+ offset = -(n_rows + 1)
+ message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
+ assert_raise(ArgumentError.new(message)) do
+ @table.slice(offset, 1)
+ end
+ end
+
+ test("offset: too large") do
+ n_rows = @table.n_rows
+ offset = n_rows
+ message = "offset is out of range (-#{n_rows + 1},#{n_rows}): #{offset}"
assert_raise(ArgumentError.new(message)) do
- @table.slice(1, 2, 3) {}
+ @table.slice(offset, 1)
end
end
end