You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/06/11 21:02:06 UTC
[arrow] branch main updated: GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)
This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ae655c5ccb GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)
ae655c5ccb is described below
commit ae655c5ccb8d4bec1acd0f6d50855a6dea1590c1
Author: takuya kodama <a....@gmail.com>
AuthorDate: Mon Jun 12 05:02:00 2023 +0800
GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)
### Rationale for this change
This change lets you read a large Parquet file one row group at a time.
- ref: https://github.com/apache/arrow/issues/36001
### What changes are included in this PR?
- Add Parquet::ArrowFileReader#each_row_group
- Add tests for it (called with and without a block)
### Are these changes tested?
Yes
- I'm not confident about the tests. Could you give me feedback on them?
### Are there any user-facing changes?
No
Close: https://github.com/apache/arrow/issues/36008
* Closes: #36008
Authored-by: otegami <a....@gmail.com>
Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
ruby/red-parquet/lib/parquet/arrow-file-reader.rb | 28 ++++++++++
ruby/red-parquet/lib/parquet/loader.rb | 1 +
ruby/red-parquet/test/test-arrow-file-reader.rb | 66 +++++++++++++++++++++++
3 files changed, 95 insertions(+)
diff --git a/ruby/red-parquet/lib/parquet/arrow-file-reader.rb b/ruby/red-parquet/lib/parquet/arrow-file-reader.rb
new file mode 100644
index 0000000000..6923c21987
--- /dev/null
+++ b/ruby/red-parquet/lib/parquet/arrow-file-reader.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+ class ArrowFileReader
+ def each_row_group
+ return to_enum(__method__) {n_row_groups} unless block_given?
+
+ n_row_groups.times do |i|
+ yield(read_row_group(i))
+ end
+ end
+ end
+end
diff --git a/ruby/red-parquet/lib/parquet/loader.rb b/ruby/red-parquet/lib/parquet/loader.rb
index b95f417b01..0c20ad2b52 100644
--- a/ruby/red-parquet/lib/parquet/loader.rb
+++ b/ruby/red-parquet/lib/parquet/loader.rb
@@ -29,6 +29,7 @@ module Parquet
end
def require_libraries
+ require "parquet/arrow-file-reader"
require "parquet/arrow-table-loadable"
require "parquet/arrow-table-savable"
require "parquet/writer-properties"
diff --git a/ruby/red-parquet/test/test-arrow-file-reader.rb b/ruby/red-parquet/test/test-arrow-file-reader.rb
new file mode 100644
index 0000000000..61db74b899
--- /dev/null
+++ b/ruby/red-parquet/test/test-arrow-file-reader.rb
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestArrowFileReader < Test::Unit::TestCase
+ def setup
+ @schema = Arrow::Schema.new(visible: :boolean)
+ table = Arrow::Table.new(@schema, [[true], [false]])
+ Tempfile.create(["red-parquet", ".parquet"]) do |file|
+ @file = file
+ Parquet::ArrowFileWriter.open(table.schema, @file.path) do |writer|
+ chunk_size = 1
+ writer.write_table(table, chunk_size)
+ end
+ yield
+ end
+ end
+
+ sub_test_case("#each_row_group") do
+ test("block") do
+ Arrow::FileInputStream.open(@file.path) do |input|
+ reader = Parquet::ArrowFileReader.new(input)
+ row_groups = []
+ reader.each_row_group do |row_group|
+ row_groups << row_group
+ end
+ assert_equal([
+ Arrow::Table.new(@schema, [[true]]),
+ Arrow::Table.new(@schema, [[false]])
+ ],
+ row_groups)
+ end
+ end
+
+ test("without block") do
+ Arrow::FileInputStream.open(@file.path) do |input|
+ reader = Parquet::ArrowFileReader.new(input)
+ each_row_group = reader.each_row_group
+ assert_equal({
+ size: 2,
+ to_a: [
+ Arrow::Table.new(@schema, [[true]]),
+ Arrow::Table.new(@schema, [[false]])
+ ],
+ },
+ {
+ size: each_row_group.size,
+ to_a: each_row_group.to_a,
+ })
+ end
+ end
+ end
+end