You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by sh...@apache.org on 2020/05/04 12:51:14 UTC

[arrow] branch master updated: ARROW-8682: [Ruby][Parquet] Add support for column level compression

This is an automated email from the ASF dual-hosted git repository.

shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ba03b7  ARROW-8682: [Ruby][Parquet] Add support for column level compression
0ba03b7 is described below

commit 0ba03b73530c10162f4db667c71ca2be0b71a1dd
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Mon May 4 21:50:51 2020 +0900

    ARROW-8682: [Ruby][Parquet] Add support for column level compression
    
    Closes #7092 from kou/ruby-parquet-compression
    
    Authored-by: Sutou Kouhei <ko...@clear-code.com>
    Signed-off-by: Yosuke Shiro <yo...@gmail.com>
---
 ruby/red-arrow/lib/arrow/table-saver.rb            | 12 ++---
 ruby/red-arrow/test/test-feather.rb                |  2 +-
 .../red-parquet/lib/parquet/arrow-table-savable.rb | 20 +++++++-
 ruby/red-parquet/lib/parquet/loader.rb             |  1 +
 ...arrow-table-savable.rb => writer-properties.rb} | 20 +++-----
 ruby/red-parquet/test/test-arrow-table.rb          | 53 ++++++++++++++++++++--
 6 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb
index 6c44b99..bc2296a 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -155,13 +155,13 @@ module Arrow
     end
 
     def save_as_feather
+      properties = FeatherWriteProperties.new
+      properties.class.properties.each do |name|
+        value = @options[name.to_sym]
+        next if value.nil?
+        properties.__send__("#{name}=", value)
+      end
       open_raw_output_stream do |output|
-        properties = FeatherWriteProperties.new
-        properties.class.properties.each do |name|
-          value = @options[name.to_sym]
-          next if value.nil?
-          properties.__send__("#{name}=", value)
-        end
         @table.write_as_feather(output, properties)
       end
     end
diff --git a/ruby/red-arrow/test/test-feather.rb b/ruby/red-arrow/test/test-feather.rb
index d36df5e..21d8a2c 100644
--- a/ruby/red-arrow/test/test-feather.rb
+++ b/ruby/red-arrow/test/test-feather.rb
@@ -28,7 +28,7 @@ class FeatherTest < Test::Unit::TestCase
     @output = Tempfile.new(["red-arrow", ".feather"])
     begin
       yield(@output)
-    rescue
+    ensure
       @output.close!
     end
   end
diff --git a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
index 7667381..0163b15 100644
--- a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
+++ b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
@@ -19,9 +19,25 @@ module Parquet
   module ArrowTableSavable
     private
     def save_as_parquet
+      properties = WriterProperties.new
+      @options.each do |key, value|
+        next if value.nil?
+        set_method_name = "set_#{key}"
+        next unless properties.respond_to?(set_method_name)
+        case value
+        when ::Array, ::Hash
+          value.each do |path, v|
+            properties.__send__(set_method_name, v, path)
+          end
+        else
+          properties.__send__(set_method_name, value)
+        end
+      end
       chunk_size = @options[:chunk_size] || 1024 # TODO
-      open_output_stream do |output|
-        Parquet::ArrowFileWriter.open(@table.schema, output) do |writer|
+      open_raw_output_stream do |output|
+        ArrowFileWriter.open(@table.schema,
+                             output,
+                             properties) do |writer|
           writer.write_table(@table, chunk_size)
         end
       end
diff --git a/ruby/red-parquet/lib/parquet/loader.rb b/ruby/red-parquet/lib/parquet/loader.rb
index a3d0cb4..5e25872 100644
--- a/ruby/red-parquet/lib/parquet/loader.rb
+++ b/ruby/red-parquet/lib/parquet/loader.rb
@@ -31,6 +31,7 @@ module Parquet
     def require_libraries
       require "parquet/arrow-table-loadable"
       require "parquet/arrow-table-savable"
+      require "parquet/writer-properties"
     end
 
     def load_object_info(info)
diff --git a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/ruby/red-parquet/lib/parquet/writer-properties.rb
similarity index 69%
copy from ruby/red-parquet/lib/parquet/arrow-table-savable.rb
copy to ruby/red-parquet/lib/parquet/writer-properties.rb
index 7667381..5881471 100644
--- a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
+++ b/ruby/red-parquet/lib/parquet/writer-properties.rb
@@ -16,21 +16,13 @@
 # under the License.
 
 module Parquet
-  module ArrowTableSavable
-    private
-    def save_as_parquet
-      chunk_size = @options[:chunk_size] || 1024 # TODO
-      open_output_stream do |output|
-        Parquet::ArrowFileWriter.open(@table.schema, output) do |writer|
-          writer.write_table(@table, chunk_size)
-        end
+  class WriterProperties
+    def set_dictionary(enable, path=nil)
+      if enable
+        enable_dictionary(path)
+      else
+        disable_dictionary(path)
       end
     end
   end
 end
-
-module Arrow
-  class TableSaver
-    include Parquet::ArrowTableSavable
-  end
-end
diff --git a/ruby/red-parquet/test/test-arrow-table.rb b/ruby/red-parquet/test/test-arrow-table.rb
index 739facc..bb0f2e0 100644
--- a/ruby/red-parquet/test/test-arrow-table.rb
+++ b/ruby/red-parquet/test/test-arrow-table.rb
@@ -19,7 +19,8 @@ class TestArrowTableReader < Test::Unit::TestCase
   def setup
     @count_field = Arrow::Field.new("count", :uint8)
     @visible_field = Arrow::Field.new("visible", :boolean)
-    schema = Arrow::Schema.new([@count_field, @visible_field])
+    @label_field = Arrow::Field.new("label", :string)
+    schema = Arrow::Schema.new([@count_field, @visible_field, @label_field])
     count_arrays = [
       Arrow::UInt8Array.new([1, 2]),
       Arrow::UInt8Array.new([4, 8, 16]),
@@ -33,16 +34,30 @@ class TestArrowTableReader < Test::Unit::TestCase
       Arrow::BooleanArray.new([nil]),
       Arrow::BooleanArray.new([nil]),
     ]
+    label_arrays = [
+      Arrow::StringArray.new(["a"]),
+      Arrow::StringArray.new(["b", "c"]),
+      Arrow::StringArray.new(["d", nil, nil]),
+      Arrow::StringArray.new(["e", "f"]),
+    ]
     @count_array = Arrow::ChunkedArray.new(count_arrays)
     @visible_array = Arrow::ChunkedArray.new(visible_arrays)
-    @table = Arrow::Table.new(schema, [@count_array, @visible_array])
+    @label_array = Arrow::ChunkedArray.new(label_arrays)
+    @table = Arrow::Table.new(schema,
+                              [@count_array, @visible_array, @label_array])
+
+    @output = Tempfile.open(["red-parquet", ".parquet"])
+    begin
+      yield(@output)
+    ensure
+      @output.close!
+    end
   end
 
   def test_save_load_path
-    tempfile = Tempfile.open(["red-parquet", ".parquet"])
-    @table.save(tempfile.path)
+    @table.save(@output.path)
     assert do
-      @table.equal_metadata(Arrow::Table.load(tempfile.path), false)
+      @table.equal_metadata(Arrow::Table.load(@output.path), false)
     end
   end
 
@@ -53,4 +68,32 @@ class TestArrowTableReader < Test::Unit::TestCase
       @table.equal_metadata(Arrow::Table.load(buffer, format: :parquet), false)
     end
   end
+
+  def test_save_load_compression
+    @table.save(@output.path, compression: :zstd)
+    assert do
+      @table.equal_metadata(Arrow::Table.load(@output.path), false)
+    end
+  end
+
+  def test_save_load_compression_path
+    @table.save(@output.path, compression: {"count" => :zstd})
+    assert do
+      @table.equal_metadata(Arrow::Table.load(@output.path), false)
+    end
+  end
+
+  def test_save_load_dictionary
+    @table.save(@output.path, dictionary: false)
+    assert do
+      @table.equal_metadata(Arrow::Table.load(@output.path), false)
+    end
+  end
+
+  def test_save_load_dictionary_path
+    @table.save(@output.path, dictionary: [["label", false]])
+    assert do
+      @table.equal_metadata(Arrow::Table.load(@output.path), false)
+    end
+  end
 end