You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by sh...@apache.org on 2020/05/04 12:51:14 UTC
[arrow] branch master updated: ARROW-8682: [Ruby][Parquet] Add support for column level compression
This is an automated email from the ASF dual-hosted git repository.
shiro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0ba03b7 ARROW-8682: [Ruby][Parquet] Add support for column level compression
0ba03b7 is described below
commit 0ba03b73530c10162f4db667c71ca2be0b71a1dd
Author: Sutou Kouhei <ko...@clear-code.com>
AuthorDate: Mon May 4 21:50:51 2020 +0900
ARROW-8682: [Ruby][Parquet] Add support for column level compression
Closes #7092 from kou/ruby-parquet-compression
Authored-by: Sutou Kouhei <ko...@clear-code.com>
Signed-off-by: Yosuke Shiro <yo...@gmail.com>
---
ruby/red-arrow/lib/arrow/table-saver.rb | 12 ++---
ruby/red-arrow/test/test-feather.rb | 2 +-
.../red-parquet/lib/parquet/arrow-table-savable.rb | 20 +++++++-
ruby/red-parquet/lib/parquet/loader.rb | 1 +
...arrow-table-savable.rb => writer-properties.rb} | 20 +++-----
ruby/red-parquet/test/test-arrow-table.rb | 53 ++++++++++++++++++++--
6 files changed, 80 insertions(+), 28 deletions(-)
diff --git a/ruby/red-arrow/lib/arrow/table-saver.rb b/ruby/red-arrow/lib/arrow/table-saver.rb
index 6c44b99..bc2296a 100644
--- a/ruby/red-arrow/lib/arrow/table-saver.rb
+++ b/ruby/red-arrow/lib/arrow/table-saver.rb
@@ -155,13 +155,13 @@ module Arrow
end
def save_as_feather
+ properties = FeatherWriteProperties.new
+ properties.class.properties.each do |name|
+ value = @options[name.to_sym]
+ next if value.nil?
+ properties.__send__("#{name}=", value)
+ end
open_raw_output_stream do |output|
- properties = FeatherWriteProperties.new
- properties.class.properties.each do |name|
- value = @options[name.to_sym]
- next if value.nil?
- properties.__send__("#{name}=", value)
- end
@table.write_as_feather(output, properties)
end
end
diff --git a/ruby/red-arrow/test/test-feather.rb b/ruby/red-arrow/test/test-feather.rb
index d36df5e..21d8a2c 100644
--- a/ruby/red-arrow/test/test-feather.rb
+++ b/ruby/red-arrow/test/test-feather.rb
@@ -28,7 +28,7 @@ class FeatherTest < Test::Unit::TestCase
@output = Tempfile.new(["red-arrow", ".feather"])
begin
yield(@output)
- rescue
+ ensure
@output.close!
end
end
diff --git a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
index 7667381..0163b15 100644
--- a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
+++ b/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
@@ -19,9 +19,25 @@ module Parquet
module ArrowTableSavable
private
def save_as_parquet
+ properties = WriterProperties.new
+ @options.each do |key, value|
+ next if value.nil?
+ set_method_name = "set_#{key}"
+ next unless properties.respond_to?(set_method_name)
+ case value
+ when ::Array, ::Hash
+ value.each do |path, v|
+ properties.__send__(set_method_name, v, path)
+ end
+ else
+ properties.__send__(set_method_name, value)
+ end
+ end
chunk_size = @options[:chunk_size] || 1024 # TODO
- open_output_stream do |output|
- Parquet::ArrowFileWriter.open(@table.schema, output) do |writer|
+ open_raw_output_stream do |output|
+ ArrowFileWriter.open(@table.schema,
+ output,
+ properties) do |writer|
writer.write_table(@table, chunk_size)
end
end
diff --git a/ruby/red-parquet/lib/parquet/loader.rb b/ruby/red-parquet/lib/parquet/loader.rb
index a3d0cb4..5e25872 100644
--- a/ruby/red-parquet/lib/parquet/loader.rb
+++ b/ruby/red-parquet/lib/parquet/loader.rb
@@ -31,6 +31,7 @@ module Parquet
def require_libraries
require "parquet/arrow-table-loadable"
require "parquet/arrow-table-savable"
+ require "parquet/writer-properties"
end
def load_object_info(info)
diff --git a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb b/ruby/red-parquet/lib/parquet/writer-properties.rb
similarity index 69%
copy from ruby/red-parquet/lib/parquet/arrow-table-savable.rb
copy to ruby/red-parquet/lib/parquet/writer-properties.rb
index 7667381..5881471 100644
--- a/ruby/red-parquet/lib/parquet/arrow-table-savable.rb
+++ b/ruby/red-parquet/lib/parquet/writer-properties.rb
@@ -16,21 +16,13 @@
# under the License.
module Parquet
- module ArrowTableSavable
- private
- def save_as_parquet
- chunk_size = @options[:chunk_size] || 1024 # TODO
- open_output_stream do |output|
- Parquet::ArrowFileWriter.open(@table.schema, output) do |writer|
- writer.write_table(@table, chunk_size)
- end
+ class WriterProperties
+ def set_dictionary(enable, path=nil)
+ if enable
+ enable_dictionary(path)
+ else
+ disable_dictionary(path)
end
end
end
end
-
-module Arrow
- class TableSaver
- include Parquet::ArrowTableSavable
- end
-end
diff --git a/ruby/red-parquet/test/test-arrow-table.rb b/ruby/red-parquet/test/test-arrow-table.rb
index 739facc..bb0f2e0 100644
--- a/ruby/red-parquet/test/test-arrow-table.rb
+++ b/ruby/red-parquet/test/test-arrow-table.rb
@@ -19,7 +19,8 @@ class TestArrowTableReader < Test::Unit::TestCase
def setup
@count_field = Arrow::Field.new("count", :uint8)
@visible_field = Arrow::Field.new("visible", :boolean)
- schema = Arrow::Schema.new([@count_field, @visible_field])
+ @label_field = Arrow::Field.new("label", :string)
+ schema = Arrow::Schema.new([@count_field, @visible_field, @label_field])
count_arrays = [
Arrow::UInt8Array.new([1, 2]),
Arrow::UInt8Array.new([4, 8, 16]),
@@ -33,16 +34,30 @@ class TestArrowTableReader < Test::Unit::TestCase
Arrow::BooleanArray.new([nil]),
Arrow::BooleanArray.new([nil]),
]
+ label_arrays = [
+ Arrow::StringArray.new(["a"]),
+ Arrow::StringArray.new(["b", "c"]),
+ Arrow::StringArray.new(["d", nil, nil]),
+ Arrow::StringArray.new(["e", "f"]),
+ ]
@count_array = Arrow::ChunkedArray.new(count_arrays)
@visible_array = Arrow::ChunkedArray.new(visible_arrays)
- @table = Arrow::Table.new(schema, [@count_array, @visible_array])
+ @label_array = Arrow::ChunkedArray.new(label_arrays)
+ @table = Arrow::Table.new(schema,
+ [@count_array, @visible_array, @label_array])
+
+ @output = Tempfile.open(["red-parquet", ".parquet"])
+ begin
+ yield(@output)
+ ensure
+ @output.close!
+ end
end
def test_save_load_path
- tempfile = Tempfile.open(["red-parquet", ".parquet"])
- @table.save(tempfile.path)
+ @table.save(@output.path)
assert do
- @table.equal_metadata(Arrow::Table.load(tempfile.path), false)
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
end
end
@@ -53,4 +68,32 @@ class TestArrowTableReader < Test::Unit::TestCase
@table.equal_metadata(Arrow::Table.load(buffer, format: :parquet), false)
end
end
+
+ def test_save_load_compression
+ @table.save(@output.path, compression: :zstd)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_compression_path
+ @table.save(@output.path, compression: {"count" => :zstd})
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_dictionary
+ @table.save(@output.path, dictionary: false)
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
+
+ def test_save_load_dictionary_path
+ @table.save(@output.path, dictionary: [["label", false]])
+ assert do
+ @table.equal_metadata(Arrow::Table.load(@output.path), false)
+ end
+ end
end