You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ko...@apache.org on 2023/06/11 21:02:06 UTC

[arrow] branch main updated: GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)

This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ae655c5ccb GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)
ae655c5ccb is described below

commit ae655c5ccb8d4bec1acd0f6d50855a6dea1590c1
Author: takuya kodama <a....@gmail.com>
AuthorDate: Mon Jun 12 05:02:00 2023 +0800

    GH-36008: [Ruby][Parquet] Add Parquet::ArrowFileReader#each_row_group (#36022)
    
    ### Rationale for this change
    This change allows you to read a large Parquet file one row group at a time.
    - ref: https://github.com/apache/arrow/issues/36001
    
    ### What changes are included in this PR?
    - Add Parquet::ArrowFileReader#each_row_group
    - Add a related test for it
    
    ### Are these changes tested?
    Yes
    - I'm not fully confident in the test. Could you give me feedback on it?
    
    ### Are there any user-facing changes?
    No
    
    Close: https://github.com/apache/arrow/issues/36008
    * Closes: #36008
    
    Authored-by: otegami <a....@gmail.com>
    Signed-off-by: Sutou Kouhei <ko...@clear-code.com>
---
 ruby/red-parquet/lib/parquet/arrow-file-reader.rb | 28 ++++++++++
 ruby/red-parquet/lib/parquet/loader.rb            |  1 +
 ruby/red-parquet/test/test-arrow-file-reader.rb   | 66 +++++++++++++++++++++++
 3 files changed, 95 insertions(+)

diff --git a/ruby/red-parquet/lib/parquet/arrow-file-reader.rb b/ruby/red-parquet/lib/parquet/arrow-file-reader.rb
new file mode 100644
index 0000000000..6923c21987
--- /dev/null
+++ b/ruby/red-parquet/lib/parquet/arrow-file-reader.rb
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Parquet
+  class ArrowFileReader
+    def each_row_group
+      return to_enum(__method__) {n_row_groups} unless block_given?
+
+      n_row_groups.times do |i|
+        yield(read_row_group(i))
+      end
+    end
+  end
+end
diff --git a/ruby/red-parquet/lib/parquet/loader.rb b/ruby/red-parquet/lib/parquet/loader.rb
index b95f417b01..0c20ad2b52 100644
--- a/ruby/red-parquet/lib/parquet/loader.rb
+++ b/ruby/red-parquet/lib/parquet/loader.rb
@@ -29,6 +29,7 @@ module Parquet
     end
 
     def require_libraries
+      require "parquet/arrow-file-reader"
       require "parquet/arrow-table-loadable"
       require "parquet/arrow-table-savable"
       require "parquet/writer-properties"
diff --git a/ruby/red-parquet/test/test-arrow-file-reader.rb b/ruby/red-parquet/test/test-arrow-file-reader.rb
new file mode 100644
index 0000000000..61db74b899
--- /dev/null
+++ b/ruby/red-parquet/test/test-arrow-file-reader.rb
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestArrowFileReader < Test::Unit::TestCase
+  def setup
+    @schema = Arrow::Schema.new(visible: :boolean)
+    table = Arrow::Table.new(@schema, [[true], [false]])
+    Tempfile.create(["red-parquet", ".parquet"]) do |file|
+      @file = file
+      Parquet::ArrowFileWriter.open(table.schema, @file.path) do |writer|
+        chunk_size = 1
+        writer.write_table(table, chunk_size)
+      end
+      yield
+    end
+  end
+
+  sub_test_case("#each_row_group") do
+    test("block") do
+      Arrow::FileInputStream.open(@file.path) do |input|
+        reader = Parquet::ArrowFileReader.new(input)
+        row_groups = []
+        reader.each_row_group do |row_group|
+          row_groups << row_group
+        end
+        assert_equal([
+                       Arrow::Table.new(@schema, [[true]]),
+                       Arrow::Table.new(@schema, [[false]])
+                     ],
+                     row_groups)
+      end
+    end
+
+    test("without block") do
+      Arrow::FileInputStream.open(@file.path) do |input|
+        reader = Parquet::ArrowFileReader.new(input)
+        each_row_group = reader.each_row_group
+        assert_equal({
+                       size: 2,
+                       to_a: [
+                         Arrow::Table.new(@schema, [[true]]),
+                         Arrow::Table.new(@schema, [[false]])
+                       ],
+                     },
+                     {
+                       size: each_row_group.size,
+                       to_a: each_row_group.to_a,
+                     })
+      end
+    end
+  end
+end